This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.
# Load the house-sales data from kc_house_data.csv; the file has a header row
# and comma-separated fields.
house <- read.csv("kc_house_data.csv", header = TRUE, sep = ",")

# Drop the `id` column so it is not picked up by the `price ~ .` model later,
# then preview the first rows.
df <- subset(house, select = -id)
head(df)
Use `substr()` to extract the year and month from `date` into separate variables
# Split the sale date into year (characters 1-4) and month (characters 5-6).
# R strings are 1-indexed, so the year starts at position 1, not 0.
df$year <- substr(df$date, 1, 4)
df$month <- substr(df$date, 5, 6)

# Both new columns are character vectors, so lm() treats them as categorical
# (see the year2015 / month02..month12 coefficients in the model output).
# The raw date column is no longer needed once they exist.
df <- subset(df, select = -date)
Run the models
# Intercept-only baseline: it predicts the mean price for every house and is
# the starting point (lower bound) for the forward selection.
null_model <- lm(formula = price ~ 1, data = df)
summary(null_model)
##
## Call:
## lm(formula = price ~ 1, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -465088 -218138 -90088 104912 7159912
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 540088 2497 216.3 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 367100 on 21612 degrees of freedom
# Upper bound for the selection: regress price on every other column of df.
# NOTE: the summary reports sqft_basement as NA ("1 not defined because of
# singularities"), i.e. it is linearly dependent on the other predictors.
full_model <- lm(formula = price ~ ., data = df)
summary(full_model)
##
## Call:
## lm(formula = price ~ ., data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1320439 -98332 -9168 77291 4332206
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.590e+06 2.922e+06 2.255 0.02413 *
## bedrooms -3.593e+04 1.886e+03 -19.052 < 2e-16 ***
## bathrooms 4.130e+04 3.244e+03 12.732 < 2e-16 ***
## sqft_living 1.503e+02 4.372e+00 34.391 < 2e-16 ***
## sqft_lot 1.298e-01 4.778e-02 2.716 0.00661 **
## floors 6.969e+03 3.585e+03 1.944 0.05191 .
## waterfront 5.833e+05 1.730e+04 33.705 < 2e-16 ***
## view 5.267e+04 2.133e+03 24.687 < 2e-16 ***
## condition 2.771e+04 2.348e+03 11.798 < 2e-16 ***
## grade 9.606e+04 2.146e+03 44.756 < 2e-16 ***
## sqft_above 3.131e+01 4.347e+00 7.203 6.07e-13 ***
## sqft_basement NA NA NA NA
## yr_built -2.614e+03 7.243e+01 -36.084 < 2e-16 ***
## yr_renovated 2.076e+01 3.646e+00 5.694 1.25e-08 ***
## zipcode -5.842e+02 3.288e+01 -17.764 < 2e-16 ***
## lat 6.050e+05 1.070e+04 56.525 < 2e-16 ***
## long -2.153e+05 1.309e+04 -16.447 < 2e-16 ***
## sqft_living15 2.149e+01 3.437e+00 6.251 4.16e-10 ***
## sqft_lot15 -3.905e-01 7.305e-02 -5.345 9.11e-08 ***
## year2015 6.003e+04 9.231e+03 6.503 8.03e-11 ***
## month02 9.277e+03 8.567e+03 1.083 0.27890
## month03 3.508e+04 7.914e+03 4.433 9.34e-06 ***
## month04 3.662e+04 7.696e+03 4.758 1.96e-06 ***
## month05 5.431e+04 1.017e+04 5.338 9.51e-08 ***
## month06 6.139e+04 1.203e+04 5.102 3.39e-07 ***
## month07 5.739e+04 1.202e+04 4.773 1.83e-06 ***
## month08 5.974e+04 1.213e+04 4.926 8.46e-07 ***
## month09 5.404e+04 1.221e+04 4.427 9.58e-06 ***
## month10 6.151e+04 1.216e+04 5.060 4.23e-07 ***
## month11 5.722e+04 1.245e+04 4.598 4.29e-06 ***
## month12 5.190e+04 1.240e+04 4.186 2.85e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 200600 on 21583 degrees of freedom
## Multiple R-squared: 0.7019, Adjusted R-squared: 0.7015
## F-statistic: 1753 on 29 and 21583 DF, p-value: < 2.2e-16
# Forward stepwise selection by AIC: start from the intercept-only model and
# add one term at a time, never going beyond the terms of the full model.
# The result is intentionally not assigned so the final model is printed.
step(
  object = null_model,
  scope = list(lower = null_model, upper = full_model),
  direction = "forward"
)
## Start: AIC=553875.8
## price ~ 1
##
## Df Sum of Sq RSS AIC
## + sqft_living 1 1.4356e+15 1.4773e+15 539204
## + grade 1 1.2976e+15 1.6153e+15 541134
## + sqft_above 1 1.0682e+15 1.8447e+15 544004
## + sqft_living15 1 9.9816e+14 1.9148e+15 544810
## + bathrooms 1 8.0329e+14 2.1096e+15 546904
## + view 1 4.5978e+14 2.4531e+15 550165
## + sqft_basement 1 3.0544e+14 2.6075e+15 551484
## + bedrooms 1 2.7696e+14 2.6360e+15 551718
## + lat 1 2.7455e+14 2.6384e+15 551738
## + waterfront 1 2.0668e+14 2.7062e+15 552287
## + floors 1 1.9209e+14 2.7208e+15 552403
## + yr_renovated 1 4.6564e+13 2.8664e+15 553529
## + sqft_lot 1 2.3417e+13 2.8895e+15 553703
## + sqft_lot15 1 1.9801e+13 2.8931e+15 553730
## + yr_built 1 8.4977e+12 2.9044e+15 553815
## + zipcode 1 8.2451e+12 2.9047e+15 553817
## + condition 1 3.8514e+12 2.9091e+15 553849
## + month 11 4.6632e+12 2.9083e+15 553863
## + long 1 1.3624e+12 2.9116e+15 553868
## <none> 2.9129e+15 553876
## + year 1 3.7251e+10 2.9129e+15 553878
##
## Step: AIC=539203.5
## price ~ sqft_living
##
## Df Sum of Sq RSS AIC
## + lat 1 2.1314e+14 1.2641e+15 535838
## + view 1 1.2362e+14 1.3537e+15 537317
## + grade 1 1.2132e+14 1.3560e+15 537353
## + waterfront 1 1.1024e+14 1.3670e+15 537529
## + yr_built 1 9.2854e+13 1.3844e+15 537802
## + long 1 6.6817e+13 1.4105e+15 538205
## + bedrooms 1 4.0635e+13 1.4366e+15 538603
## + zipcode 1 2.2858e+13 1.4544e+15 538868
## + yr_renovated 1 2.2405e+13 1.4549e+15 538875
## + sqft_living15 1 2.0109e+13 1.4572e+15 538909
## + condition 1 1.7605e+13 1.4597e+15 538946
## + sqft_lot15 1 6.4407e+12 1.4708e+15 539111
## + sqft_lot 1 3.0113e+12 1.4743e+15 539161
## + month 11 4.0868e+12 1.4732e+15 539166
## + year 1 1.6739e+12 1.4756e+15 539181
## + sqft_above 1 1.2165e+12 1.4761e+15 539188
## + sqft_basement 1 1.2165e+12 1.4761e+15 539188
## + floors 1 2.2991e+11 1.4770e+15 539202
## + bathrooms 1 1.4719e+11 1.4771e+15 539203
## <none> 1.4773e+15 539204
##
## Step: AIC=535838
## price ~ sqft_living + lat
##
## Df Sum of Sq RSS AIC
## + view 1 1.2663e+14 1.1375e+15 533559
## + waterfront 1 1.1646e+14 1.1477e+15 533751
## + grade 1 8.8423e+13 1.1757e+15 534273
## + yr_built 1 5.1904e+13 1.2122e+15 534934
## + long 1 3.6167e+13 1.2280e+15 535213
## + bedrooms 1 3.2254e+13 1.2319e+15 535281
## + condition 1 1.9095e+13 1.2450e+15 535511
## + yr_renovated 1 1.8897e+13 1.2452e+15 535515
## + sqft_living15 1 1.8325e+13 1.2458e+15 535524
## + year 1 2.8880e+12 1.2613e+15 535791
## + month 11 4.0367e+12 1.2601e+15 535791
## + sqft_lot15 1 1.2429e+12 1.2629e+15 535819
## + zipcode 1 4.4621e+11 1.2637e+15 535832
## <none> 1.2641e+15 535838
## + sqft_lot 1 1.0913e+11 1.2640e+15 535838
## + sqft_above 1 1.0387e+11 1.2640e+15 535838
## + sqft_basement 1 1.0387e+11 1.2640e+15 535838
## + bathrooms 1 2.2942e+09 1.2641e+15 535840
## + floors 1 2.9322e+07 1.2641e+15 535840
##
## Step: AIC=533558.7
## price ~ sqft_living + lat + view
##
## Df Sum of Sq RSS AIC
## + grade 1 7.7085e+13 1.0604e+15 532044
## + waterfront 1 4.8301e+13 1.0892e+15 532623
## + yr_built 1 2.9685e+13 1.1078e+15 532989
## + bedrooms 1 2.0105e+13 1.1174e+15 533175
## + long 1 1.8126e+13 1.1194e+15 533214
## + condition 1 1.3259e+13 1.1242e+15 533307
## + yr_renovated 1 1.1033e+13 1.1265e+15 533350
## + sqft_living15 1 9.7773e+12 1.1277e+15 533374
## + sqft_above 1 5.6493e+12 1.1319e+15 533453
## + sqft_basement 1 5.6493e+12 1.1319e+15 533453
## + month 11 3.7455e+12 1.1338e+15 533509
## + year 1 2.5256e+12 1.1350e+15 533513
## + sqft_lot15 1 1.8222e+12 1.1357e+15 533526
## + zipcode 1 1.3136e+12 1.1362e+15 533536
## + floors 1 7.9084e+11 1.1367e+15 533546
## + sqft_lot 1 3.9207e+11 1.1371e+15 533553
## + bathrooms 1 1.9270e+11 1.1373e+15 533557
## <none> 1.1375e+15 533559
##
## Step: AIC=532044.1
## price ~ sqft_living + lat + view + grade
##
## Df Sum of Sq RSS AIC
## + yr_built 1 8.9146e+13 9.7128e+14 530148
## + waterfront 1 5.0218e+13 1.0102e+15 530998
## + condition 1 2.5997e+13 1.0344e+15 531510
## + long 1 2.2309e+13 1.0381e+15 531587
## + yr_renovated 1 1.4312e+13 1.0461e+15 531752
## + bedrooms 1 1.0398e+13 1.0500e+15 531833
## + floors 1 3.9309e+12 1.0565e+15 531966
## + year 1 2.8187e+12 1.0576e+15 531989
## + month 11 3.5927e+12 1.0568e+15 531993
## + bathrooms 1 2.2781e+12 1.0581e+15 532000
## + sqft_lot15 1 1.3272e+12 1.0591e+15 532019
## + sqft_lot 1 2.0910e+11 1.0602e+15 532042
## + sqft_above 1 1.3720e+11 1.0603e+15 532043
## + sqft_basement 1 1.3720e+11 1.0603e+15 532043
## + sqft_living15 1 1.1809e+11 1.0603e+15 532044
## <none> 1.0604e+15 532044
## + zipcode 1 7.8101e+10 1.0603e+15 532045
##
## Step: AIC=530148.3
## price ~ sqft_living + lat + view + grade + yr_built
##
## Df Sum of Sq RSS AIC
## + waterfront 1 5.0449e+13 9.2083e+14 528997
## + bedrooms 1 1.1098e+13 9.6018e+14 529902
## + zipcode 1 6.4623e+12 9.6481e+14 530006
## + bathrooms 1 5.2656e+12 9.6601e+14 530033
## + condition 1 4.2739e+12 9.6700e+14 530055
## + year 1 3.3309e+12 9.6795e+14 530076
## + month 11 3.8104e+12 9.6747e+14 530085
## + long 1 2.8391e+12 9.6844e+14 530087
## + yr_renovated 1 2.3436e+12 9.6893e+14 530098
## + floors 1 2.1809e+12 9.6910e+14 530102
## + sqft_above 1 2.1769e+12 9.6910e+14 530102
## + sqft_basement 1 2.1769e+12 9.6910e+14 530102
## + sqft_lot15 1 1.1384e+12 9.7014e+14 530125
## + sqft_living15 1 6.4656e+11 9.7063e+14 530136
## + sqft_lot 1 2.8898e+11 9.7099e+14 530144
## <none> 9.7128e+14 530148
##
## Step: AIC=528997.4
## price ~ sqft_living + lat + view + grade + yr_built + waterfront
##
## Df Sum of Sq RSS AIC
## + bedrooms 1 9.0057e+12 9.1182e+14 528787
## + zipcode 1 6.3395e+12 9.1449e+14 528850
## + bathrooms 1 5.4031e+12 9.1543e+14 528872
## + condition 1 4.4331e+12 9.1639e+14 528895
## + year 1 3.4936e+12 9.1733e+14 528917
## + month 11 3.8991e+12 9.1693e+14 528928
## + long 1 2.5647e+12 9.1826e+14 528939
## + floors 1 1.6628e+12 9.1917e+14 528960
## + sqft_above 1 1.4511e+12 9.1938e+14 528965
## + sqft_basement 1 1.4511e+12 9.1938e+14 528965
## + yr_renovated 1 1.2489e+12 9.1958e+14 528970
## + sqft_lot15 1 1.1644e+12 9.1966e+14 528972
## + sqft_living15 1 9.9743e+11 9.1983e+14 528976
## + sqft_lot 1 2.2143e+11 9.2061e+14 528994
## <none> 9.2083e+14 528997
##
## Step: AIC=528787
## price ~ sqft_living + lat + view + grade + yr_built + waterfront +
## bedrooms
##
## Df Sum of Sq RSS AIC
## + bathrooms 1 9.0102e+12 9.0281e+14 528574
## + zipcode 1 6.9168e+12 9.0491e+14 528624
## + condition 1 5.2627e+12 9.0656e+14 528664
## + year 1 3.5695e+12 9.0825e+14 528704
## + month 11 4.0171e+12 9.0781e+14 528714
## + long 1 2.8458e+12 9.0898e+14 528721
## + sqft_lot15 1 1.9499e+12 9.0987e+14 528743
## + floors 1 1.7197e+12 9.1010e+14 528748
## + yr_renovated 1 1.1626e+12 9.1066e+14 528761
## + sqft_above 1 1.1004e+12 9.1072e+14 528763
## + sqft_basement 1 1.1004e+12 9.1072e+14 528763
## + sqft_living15 1 8.3834e+11 9.1098e+14 528769
## + sqft_lot 1 5.6135e+11 9.1126e+14 528776
## <none> 9.1182e+14 528787
##
## Step: AIC=528574.4
## price ~ sqft_living + lat + view + grade + yr_built + waterfront +
## bedrooms + bathrooms
##
## Df Sum of Sq RSS AIC
## + zipcode 1 7.7118e+12 8.9510e+14 528391
## + condition 1 4.8311e+12 8.9798e+14 528460
## + year 1 3.7288e+12 8.9908e+14 528487
## + month 11 4.1427e+12 8.9867e+14 528497
## + long 1 2.0291e+12 9.0078e+14 528528
## + sqft_above 1 1.6201e+12 9.0119e+14 528538
## + sqft_basement 1 1.6201e+12 9.0119e+14 528538
## + sqft_living15 1 1.4832e+12 9.0133e+14 528541
## + sqft_lot15 1 1.4206e+12 9.0139e+14 528542
## + yr_renovated 1 4.2334e+11 9.0239e+14 528566
## + floors 1 3.9100e+11 9.0242e+14 528567
## + sqft_lot 1 3.6234e+11 9.0245e+14 528568
## <none> 9.0281e+14 528574
##
## Step: AIC=528391
## price ~ sqft_living + lat + view + grade + yr_built + waterfront +
## bedrooms + bathrooms + zipcode
##
## Df Sum of Sq RSS AIC
## + long 1 9.6955e+12 8.8540e+14 528158
## + year 1 3.7867e+12 8.9131e+14 528301
## + condition 1 3.5280e+12 8.9157e+14 528308
## + month 11 4.2023e+12 8.9090e+14 528311
## + sqft_lot15 1 2.1877e+12 8.9291e+14 528340
## + sqft_above 1 1.1628e+12 8.9394e+14 528365
## + sqft_basement 1 1.1628e+12 8.9394e+14 528365
## + floors 1 1.0460e+12 8.9405e+14 528368
## + sqft_lot 1 7.2300e+11 8.9438e+14 528376
## + sqft_living15 1 4.3262e+11 8.9467e+14 528383
## + yr_renovated 1 3.8775e+11 8.9471e+14 528384
## <none> 8.9510e+14 528391
##
## Step: AIC=528157.6
## price ~ sqft_living + lat + view + grade + yr_built + waterfront +
## bedrooms + bathrooms + zipcode + long
##
## Df Sum of Sq RSS AIC
## + year 1 3.8111e+12 8.8159e+14 528066
## + month 11 4.2814e+12 8.8112e+14 528075
## + condition 1 3.2700e+12 8.8213e+14 528080
## + sqft_above 1 2.8185e+12 8.8259e+14 528091
## + sqft_basement 1 2.8185e+12 8.8259e+14 528091
## + sqft_living15 1 1.5701e+12 8.8383e+14 528121
## + floors 1 8.8103e+11 8.8452e+14 528138
## + sqft_lot15 1 7.8011e+11 8.8462e+14 528141
## + yr_renovated 1 5.1267e+11 8.8489e+14 528147
## <none> 8.8540e+14 528158
## + sqft_lot 1 8.0929e+10 8.8532e+14 528158
##
## Step: AIC=528066.4
## price ~ sqft_living + lat + view + grade + yr_built + waterfront +
## bedrooms + bathrooms + zipcode + long + year
##
## Df Sum of Sq RSS AIC
## + condition 1 3.6274e+12 8.7797e+14 527979
## + sqft_above 1 2.8009e+12 8.7879e+14 528000
## + sqft_basement 1 2.8009e+12 8.7879e+14 528000
## + sqft_living15 1 1.5610e+12 8.8003e+14 528030
## + month 11 2.0269e+12 8.7957e+14 528039
## + floors 1 9.2843e+11 8.8067e+14 528046
## + sqft_lot15 1 7.8710e+11 8.8081e+14 528049
## + yr_renovated 1 5.6324e+11 8.8103e+14 528055
## + sqft_lot 1 8.9944e+10 8.8150e+14 528066
## <none> 8.8159e+14 528066
##
## Step: AIC=527979.3
## price ~ sqft_living + lat + view + grade + yr_built + waterfront +
## bedrooms + bathrooms + zipcode + long + year + condition
##
## Df Sum of Sq RSS AIC
## + sqft_above 1 3.7222e+12 8.7424e+14 527889
## + sqft_basement 1 3.7222e+12 8.7424e+14 527889
## + sqft_living15 1 1.7359e+12 8.7623e+14 527938
## + floors 1 1.4085e+12 8.7656e+14 527947
## + yr_renovated 1 1.2248e+12 8.7674e+14 527951
## + month 11 2.0314e+12 8.7593e+14 527951
## + sqft_lot15 1 8.0771e+11 8.7716e+14 527961
## + sqft_lot 1 8.1878e+10 8.7788e+14 527979
## <none> 8.7797e+14 527979
##
## Step: AIC=527889.4
## price ~ sqft_living + lat + view + grade + yr_built + waterfront +
## bedrooms + bathrooms + zipcode + long + year + condition +
## sqft_above
##
## Df Sum of Sq RSS AIC
## + sqft_living15 1 1.2598e+12 8.7298e+14 527860
## + month 11 2.0638e+12 8.7218e+14 527860
## + yr_renovated 1 1.1838e+12 8.7306e+14 527862
## + sqft_lot15 1 9.0238e+11 8.7334e+14 527869
## + floors 1 1.5463e+11 8.7409e+14 527888
## + sqft_lot 1 1.1947e+11 8.7412e+14 527888
## <none> 8.7424e+14 527889
##
## Step: AIC=527860.3
## price ~ sqft_living + lat + view + grade + yr_built + waterfront +
## bedrooms + bathrooms + zipcode + long + year + condition +
## sqft_above + sqft_living15
##
## Df Sum of Sq RSS AIC
## + yr_renovated 1 1.3372e+12 8.7165e+14 527829
## + month 11 2.0277e+12 8.7096e+14 527832
## + sqft_lot15 1 9.3286e+11 8.7205e+14 527839
## + floors 1 2.7522e+11 8.7271e+14 527855
## + sqft_lot 1 9.3474e+10 8.7289e+14 527860
## <none> 8.7298e+14 527860
##
## Step: AIC=527829.2
## price ~ sqft_living + lat + view + grade + yr_built + waterfront +
## bedrooms + bathrooms + zipcode + long + year + condition +
## sqft_above + sqft_living15 + yr_renovated
##
## Df Sum of Sq RSS AIC
## + month 11 2.0209e+12 8.6963e+14 527801
## + sqft_lot15 1 9.3924e+11 8.7071e+14 527808
## + floors 1 2.1579e+11 8.7143e+14 527826
## + sqft_lot 1 8.7417e+10 8.7156e+14 527829
## <none> 8.7165e+14 527829
##
## Step: AIC=527801
## price ~ sqft_living + lat + view + grade + yr_built + waterfront +
## bedrooms + bathrooms + zipcode + long + year + condition +
## sqft_above + sqft_living15 + yr_renovated + month
##
## Df Sum of Sq RSS AIC
## + sqft_lot15 1 9.7978e+11 8.6865e+14 527779
## + floors 1 1.9899e+11 8.6943e+14 527798
## + sqft_lot 1 8.6862e+10 8.6954e+14 527801
## <none> 8.6963e+14 527801
##
## Step: AIC=527778.6
## price ~ sqft_living + lat + view + grade + yr_built + waterfront +
## bedrooms + bathrooms + zipcode + long + year + condition +
## sqft_above + sqft_living15 + yr_renovated + month + sqft_lot15
##
## Df Sum of Sq RSS AIC
## + sqft_lot 1 2.8792e+11 8.6836e+14 527773
## + floors 1 1.4315e+11 8.6850e+14 527777
## <none> 8.6865e+14 527779
##
## Step: AIC=527773.4
## price ~ sqft_living + lat + view + grade + yr_built + waterfront +
## bedrooms + bathrooms + zipcode + long + year + condition +
## sqft_above + sqft_living15 + yr_renovated + month + sqft_lot15 +
## sqft_lot
##
## Df Sum of Sq RSS AIC
## + floors 1 1.5202e+11 8.6821e+14 527772
## <none> 8.6836e+14 527773
##
## Step: AIC=527771.7
## price ~ sqft_living + lat + view + grade + yr_built + waterfront +
## bedrooms + bathrooms + zipcode + long + year + condition +
## sqft_above + sqft_living15 + yr_renovated + month + sqft_lot15 +
## sqft_lot + floors
##
## Df Sum of Sq RSS AIC
## <none> 8.6821e+14 527772
##
## Call:
## lm(formula = price ~ sqft_living + lat + view + grade + yr_built +
## waterfront + bedrooms + bathrooms + zipcode + long + year +
## condition + sqft_above + sqft_living15 + yr_renovated + month +
## sqft_lot15 + sqft_lot + floors, data = df)
##
## Coefficients:
## (Intercept) sqft_living lat view grade
## 6.590e+06 1.503e+02 6.050e+05 5.267e+04 9.606e+04
## yr_built waterfront bedrooms bathrooms zipcode
## -2.614e+03 5.833e+05 -3.593e+04 4.130e+04 -5.842e+02
## long year2015 condition sqft_above sqft_living15
## -2.153e+05 6.003e+04 2.771e+04 3.131e+01 2.149e+01
## yr_renovated month02 month03 month04 month05
## 2.076e+01 9.277e+03 3.508e+04 3.662e+04 5.431e+04
## month06 month07 month08 month09 month10
## 6.139e+04 5.739e+04 5.974e+04 5.404e+04 6.151e+04
## month11 month12 sqft_lot15 sqft_lot floors
## 5.722e+04 5.190e+04 -3.905e-01 1.298e-01 6.969e+03
sqft_living is an important predictor for the price of a home, but can you explain why lat (North-South) is such a high predictor for the model? Conversely, why do you think long (East-West) is such a low predictor?
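Answer: In the second step of the forward selection, adding lat to sqft_living gives a much lower AIC (535838) than adding long (538205), which is why lat is selected early as a high predictor. As soon as sqft_living and lat are both in the model, the additional variation explained by long drops sharply (its Sum of Sq falls from 6.6817e+13 to 3.6167e+13), so long is not added until the tenth step.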
My top 10 predictors for the linear model are (sqft_living + lat + view + grade + yr_built + waterfront + bedrooms + bathrooms + zipcode + long). The R^2 for the model fitted below is 0.7051, which is not a good R^2 value but is still better than the initial model. This suggests that the predictors in this model are not efficient or relevant for predicting the outcome, price.
# Refit with a hand-picked set of terms. sqft_living/sqft_living15 and
# lat/long enter only as `:` interaction (product) terms, so those four
# variables get no standalone coefficients in the summary.
lm1 <- lm(
  formula = price ~ (
    sqft_living:sqft_living15 + lat:long + view + grade + yr_built +
      waterfront + bedrooms + bathrooms + zipcode + year
  ),
  data = df
)
summary(lm1)
##
## Call:
## lm(formula = price ~ (sqft_living:sqft_living15 + lat:long +
## view + grade + yr_built + waterfront + bedrooms + bathrooms +
## zipcode + year), data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1643523 -96600 -10003 75976 4161373
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.138e+07 2.813e+06 11.157 <2e-16 ***
## view 4.567e+04 2.071e+03 22.053 <2e-16 ***
## grade 9.641e+04 2.005e+03 48.083 <2e-16 ***
## yr_built -2.943e+03 6.098e+01 -48.265 <2e-16 ***
## waterfront 6.039e+05 1.714e+04 35.230 <2e-16 ***
## bedrooms -1.820e+04 1.771e+03 -10.272 <2e-16 ***
## bathrooms 7.358e+04 2.861e+03 25.717 <2e-16 ***
## zipcode -5.516e+02 2.987e+01 -18.466 <2e-16 ***
## year2015 2.795e+04 2.904e+03 9.624 <2e-16 ***
## sqft_living:sqft_living15 4.089e-02 6.361e-04 64.291 <2e-16 ***
## lat:long -4.820e+03 8.249e+01 -58.429 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 199400 on 21602 degrees of freedom
## Multiple R-squared: 0.7051, Adjusted R-squared: 0.705
## F-statistic: 5165 on 10 and 21602 DF, p-value: < 2.2e-16
Convert the variable zipcode from numeric to a factor variable. The value of R^2 changed (from 0.7019 to 0.8103 for the full model) and the selected predictors changed as well when zipcode is converted into a factor.
# Zip codes are labels, not quantities: store them as a factor so lm() fits
# one coefficient per zip code instead of a single numeric slope.
df[["zipcode"]] <- as.factor(df[["zipcode"]])
Running the models again to find the top 10 predictors
# Rebuild the intercept-only baseline on the updated df (zipcode is now a
# factor); it is again the lower bound for the forward selection.
null_model <- lm(formula = price ~ 1, data = df)
summary(null_model)
##
## Call:
## lm(formula = price ~ 1, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -465088 -218138 -90088 104912 7159912
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 540088 2497 216.3 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 367100 on 21612 degrees of freedom
# Rebuild the all-predictors model; with zipcode as a factor it now gets one
# coefficient per zip code. sqft_basement is still reported as NA
# ("1 not defined because of singularities").
full_model <- lm(formula = price ~ ., data = df)
summary(full_model)
##
## Call:
## lm(formula = price ~ ., data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1164529 -69483 20 61316 4412767
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.523e+07 6.123e+06 -4.122 3.78e-05 ***
## bedrooms -2.692e+04 1.527e+03 -17.626 < 2e-16 ***
## bathrooms 2.349e+04 2.617e+03 8.975 < 2e-16 ***
## sqft_living 1.300e+02 3.536e+00 36.771 < 2e-16 ***
## sqft_lot 2.450e-01 3.833e-02 6.392 1.67e-10 ***
## floors -4.532e+04 3.152e+03 -14.376 < 2e-16 ***
## waterfront 6.575e+05 1.410e+04 46.638 < 2e-16 ***
## view 5.520e+04 1.750e+03 31.537 < 2e-16 ***
## condition 2.671e+04 1.925e+03 13.874 < 2e-16 ***
## grade 5.706e+04 1.811e+03 31.502 < 2e-16 ***
## sqft_above 7.929e+01 3.625e+00 21.871 < 2e-16 ***
## sqft_basement NA NA NA NA
## yr_built -6.932e+02 6.477e+01 -10.703 < 2e-16 ***
## yr_renovated 1.842e+01 2.940e+00 6.267 3.75e-10 ***
## zipcode98002 3.258e+04 1.443e+04 2.258 0.023957 *
## zipcode98003 -2.307e+04 1.290e+04 -1.789 0.073654 .
## zipcode98004 7.170e+05 2.344e+04 30.595 < 2e-16 ***
## zipcode98005 2.479e+05 2.505e+04 9.896 < 2e-16 ***
## zipcode98006 2.280e+05 2.048e+04 11.133 < 2e-16 ***
## zipcode98007 1.947e+05 2.585e+04 7.532 5.21e-14 ***
## zipcode98008 2.048e+05 2.455e+04 8.342 < 2e-16 ***
## zipcode98010 9.673e+04 2.200e+04 4.397 1.10e-05 ***
## zipcode98011 3.566e+04 3.194e+04 1.116 0.264294
## zipcode98014 7.903e+04 3.509e+04 2.252 0.024327 *
## zipcode98019 3.806e+04 3.460e+04 1.100 0.271418
## zipcode98022 4.270e+04 1.910e+04 2.235 0.025408 *
## zipcode98023 -4.615e+04 1.186e+04 -3.890 0.000100 ***
## zipcode98024 1.493e+05 3.080e+04 4.846 1.27e-06 ***
## zipcode98027 1.516e+05 2.102e+04 7.214 5.61e-13 ***
## zipcode98028 2.760e+04 3.102e+04 0.890 0.373560
## zipcode98029 1.916e+05 2.401e+04 7.980 1.53e-15 ***
## zipcode98030 7.133e+02 1.418e+04 0.050 0.959881
## zipcode98031 2.944e+03 1.476e+04 0.199 0.841954
## zipcode98032 -7.690e+03 1.715e+04 -0.448 0.653852
## zipcode98033 2.929e+05 2.661e+04 11.007 < 2e-16 ***
## zipcode98034 1.222e+05 2.853e+04 4.284 1.84e-05 ***
## zipcode98038 4.671e+04 1.592e+04 2.934 0.003352 **
## zipcode98039 1.252e+06 3.168e+04 39.524 < 2e-16 ***
## zipcode98040 4.591e+05 2.073e+04 22.148 < 2e-16 ***
## zipcode98042 1.009e+04 1.357e+04 0.744 0.457092
## zipcode98045 1.246e+05 2.941e+04 4.236 2.29e-05 ***
## zipcode98052 1.668e+05 2.717e+04 6.140 8.38e-10 ***
## zipcode98053 1.430e+05 2.910e+04 4.914 9.01e-07 ***
## zipcode98055 2.237e+04 1.645e+04 1.360 0.173967
## zipcode98056 6.289e+04 1.787e+04 3.520 0.000433 ***
## zipcode98058 1.305e+04 1.554e+04 0.840 0.400776
## zipcode98059 5.950e+04 1.753e+04 3.394 0.000689 ***
## zipcode98065 8.636e+04 2.711e+04 3.186 0.001446 **
## zipcode98070 -6.644e+04 2.068e+04 -3.212 0.001319 **
## zipcode98072 7.191e+04 3.177e+04 2.264 0.023611 *
## zipcode98074 1.323e+05 2.573e+04 5.144 2.71e-07 ***
## zipcode98075 1.338e+05 2.474e+04 5.411 6.35e-08 ***
## zipcode98077 4.946e+04 3.306e+04 1.496 0.134611
## zipcode98092 -2.484e+04 1.290e+04 -1.926 0.054079 .
## zipcode98102 4.432e+05 2.743e+04 16.159 < 2e-16 ***
## zipcode98103 2.565e+05 2.574e+04 9.966 < 2e-16 ***
## zipcode98105 3.920e+05 2.643e+04 14.834 < 2e-16 ***
## zipcode98106 9.073e+04 1.907e+04 4.758 1.96e-06 ***
## zipcode98107 2.614e+05 2.653e+04 9.851 < 2e-16 ***
## zipcode98108 7.564e+04 2.105e+04 3.593 0.000328 ***
## zipcode98109 4.197e+05 2.734e+04 15.352 < 2e-16 ***
## zipcode98112 5.511e+05 2.426e+04 22.721 < 2e-16 ***
## zipcode98115 2.495e+05 2.616e+04 9.537 < 2e-16 ***
## zipcode98116 2.200e+05 2.129e+04 10.334 < 2e-16 ***
## zipcode98117 2.275e+05 2.649e+04 8.589 < 2e-16 ***
## zipcode98118 1.232e+05 1.859e+04 6.628 3.48e-11 ***
## zipcode98119 3.983e+05 2.582e+04 15.427 < 2e-16 ***
## zipcode98122 2.749e+05 2.303e+04 11.935 < 2e-16 ***
## zipcode98125 1.127e+05 2.825e+04 3.990 6.63e-05 ***
## zipcode98126 1.349e+05 1.955e+04 6.901 5.30e-12 ***
## zipcode98133 6.862e+04 2.917e+04 2.353 0.018649 *
## zipcode98136 1.845e+05 2.004e+04 9.206 < 2e-16 ***
## zipcode98144 2.191e+05 2.141e+04 10.234 < 2e-16 ***
## zipcode98146 5.762e+04 1.789e+04 3.220 0.001284 **
## zipcode98148 3.645e+04 2.435e+04 1.497 0.134445
## zipcode98155 4.880e+04 3.034e+04 1.609 0.107714
## zipcode98166 1.441e+04 1.638e+04 0.880 0.379120
## zipcode98168 3.908e+04 1.731e+04 2.258 0.023942 *
## zipcode98177 1.142e+05 3.045e+04 3.750 0.000177 ***
## zipcode98178 5.744e+03 1.788e+04 0.321 0.748000
## zipcode98188 6.447e+03 1.835e+04 0.351 0.725335
## zipcode98198 -2.370e+04 1.390e+04 -1.704 0.088310 .
## zipcode98199 2.995e+05 2.515e+04 11.908 < 2e-16 ***
## lat 2.138e+05 6.321e+04 3.382 0.000721 ***
## long -1.301e+05 4.540e+04 -2.865 0.004173 **
## sqft_living15 1.033e+01 2.881e+00 3.586 0.000336 ***
## sqft_lot15 -1.327e-01 6.033e-02 -2.199 0.027874 *
## year2015 6.205e+04 7.391e+03 8.395 < 2e-16 ***
## month02 6.357e+03 6.856e+03 0.927 0.353807
## month03 2.675e+04 6.332e+03 4.224 2.41e-05 ***
## month04 3.252e+04 6.159e+03 5.280 1.30e-07 ***
## month05 4.865e+04 8.145e+03 5.972 2.38e-09 ***
## month06 6.062e+04 9.633e+03 6.293 3.17e-10 ***
## month07 5.426e+04 9.625e+03 5.637 1.75e-08 ***
## month08 5.784e+04 9.710e+03 5.957 2.61e-09 ***
## month09 5.176e+04 9.769e+03 5.299 1.18e-07 ***
## month10 5.443e+04 9.734e+03 5.592 2.28e-08 ***
## month11 5.545e+04 9.965e+03 5.564 2.66e-08 ***
## month12 5.673e+04 9.925e+03 5.716 1.10e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 160200 on 21515 degrees of freedom
## Multiple R-squared: 0.8103, Adjusted R-squared: 0.8095
## F-statistic: 947.7 on 97 and 21515 DF, p-value: < 2.2e-16
# Repeat the forward AIC selection with zipcode as a factor: it is now added
# or skipped as one multi-level term (Df = 69 in the trace).
# The result is intentionally not assigned so the final model is printed.
step(
  object = null_model,
  scope = list(lower = null_model, upper = full_model),
  direction = "forward"
)
## Start: AIC=553875.8
## price ~ 1
##
## Df Sum of Sq RSS AIC
## + sqft_living 1 1.4356e+15 1.4773e+15 539204
## + grade 1 1.2976e+15 1.6153e+15 541134
## + zipcode 69 1.1867e+15 1.7262e+15 542706
## + sqft_above 1 1.0682e+15 1.8447e+15 544004
## + sqft_living15 1 9.9816e+14 1.9148e+15 544810
## + bathrooms 1 8.0329e+14 2.1096e+15 546904
## + view 1 4.5978e+14 2.4531e+15 550165
## + sqft_basement 1 3.0544e+14 2.6075e+15 551484
## + bedrooms 1 2.7696e+14 2.6360e+15 551718
## + lat 1 2.7455e+14 2.6384e+15 551738
## + waterfront 1 2.0668e+14 2.7062e+15 552287
## + floors 1 1.9209e+14 2.7208e+15 552403
## + yr_renovated 1 4.6564e+13 2.8664e+15 553529
## + sqft_lot 1 2.3417e+13 2.8895e+15 553703
## + sqft_lot15 1 1.9801e+13 2.8931e+15 553730
## + yr_built 1 8.4977e+12 2.9044e+15 553815
## + condition 1 3.8514e+12 2.9091e+15 553849
## + month 11 4.6632e+12 2.9083e+15 553863
## + long 1 1.3624e+12 2.9116e+15 553868
## <none> 2.9129e+15 553876
## + year 1 3.7251e+10 2.9129e+15 553878
##
## Step: AIC=539203.5
## price ~ sqft_living
##
## Df Sum of Sq RSS AIC
## + zipcode 69 6.9104e+14 7.8624e+14 525710
## + lat 1 2.1314e+14 1.2641e+15 535838
## + view 1 1.2362e+14 1.3537e+15 537317
## + grade 1 1.2132e+14 1.3560e+15 537353
## + waterfront 1 1.1024e+14 1.3670e+15 537529
## + yr_built 1 9.2854e+13 1.3844e+15 537802
## + long 1 6.6817e+13 1.4105e+15 538205
## + bedrooms 1 4.0635e+13 1.4366e+15 538603
## + yr_renovated 1 2.2405e+13 1.4549e+15 538875
## + sqft_living15 1 2.0109e+13 1.4572e+15 538909
## + condition 1 1.7605e+13 1.4597e+15 538946
## + sqft_lot15 1 6.4407e+12 1.4708e+15 539111
## + sqft_lot 1 3.0113e+12 1.4743e+15 539161
## + month 11 4.0868e+12 1.4732e+15 539166
## + year 1 1.6739e+12 1.4756e+15 539181
## + sqft_above 1 1.2165e+12 1.4761e+15 539188
## + sqft_basement 1 1.2165e+12 1.4761e+15 539188
## + floors 1 2.2991e+11 1.4770e+15 539202
## + bathrooms 1 1.4719e+11 1.4771e+15 539203
## <none> 1.4773e+15 539204
##
## Step: AIC=525710.2
## price ~ sqft_living + zipcode
##
## Df Sum of Sq RSS AIC
## + waterfront 1 1.1577e+14 6.7047e+14 522270
## + view 1 9.4436e+13 6.9180e+14 522947
## + grade 1 4.2251e+13 7.4398e+14 524518
## + bedrooms 1 2.4026e+13 7.6221e+14 525041
## + sqft_living15 1 1.6595e+13 7.6964e+14 525251
## + sqft_above 1 8.9744e+12 7.7726e+14 525464
## + sqft_basement 1 8.9744e+12 7.7726e+14 525464
## + yr_renovated 1 4.6850e+12 7.8155e+14 525583
## + year 1 3.9193e+12 7.8232e+14 525604
## + condition 1 3.8480e+12 7.8239e+14 525606
## + yr_built 1 3.6430e+12 7.8259e+14 525612
## + month 11 3.9947e+12 7.8224e+14 525622
## + sqft_lot 1 2.9632e+12 7.8327e+14 525631
## + sqft_lot15 1 1.2435e+12 7.8499e+14 525678
## + long 1 7.0269e+11 7.8553e+14 525693
## + floors 1 5.5501e+11 7.8568e+14 525697
## + lat 1 1.1386e+11 7.8612e+14 525709
## <none> 7.8624e+14 525710
## + bathrooms 1 9.9318e+09 7.8623e+14 525712
##
## Step: AIC=522269.7
## price ~ sqft_living + zipcode + waterfront
##
## Df Sum of Sq RSS AIC
## + grade 1 3.9096e+13 6.3137e+14 520973
## + view 1 3.7180e+13 6.3329e+14 521039
## + bedrooms 1 1.6264e+13 6.5420e+14 521741
## + sqft_living15 1 1.3958e+13 6.5651e+14 521817
## + sqft_above 1 1.0767e+13 6.5970e+14 521922
## + sqft_basement 1 1.0767e+13 6.5970e+14 521922
## + year 1 3.8981e+12 6.6657e+14 522146
## + sqft_lot 1 3.5114e+12 6.6696e+14 522158
## + condition 1 3.3423e+12 6.6713e+14 522164
## + month 11 3.9016e+12 6.6657e+14 522166
## + yr_built 1 1.9059e+12 6.6856e+14 522210
## + yr_renovated 1 1.6654e+12 6.6880e+14 522218
## + sqft_lot15 1 1.5408e+12 6.6893e+14 522222
## + floors 1 6.3045e+11 6.6984e+14 522251
## + lat 1 4.0376e+11 6.7006e+14 522259
## + long 1 2.3133e+11 6.7024e+14 522264
## <none> 6.7047e+14 522270
## + bathrooms 1 3.8057e+06 6.7047e+14 522272
##
## Step: AIC=520973.2
## price ~ sqft_living + zipcode + waterfront + grade
##
## Df Sum of Sq RSS AIC
## + view 1 3.1697e+13 5.9967e+14 519862
## + yr_built 1 1.8348e+13 6.1302e+14 520338
## + bedrooms 1 9.8149e+12 6.2156e+14 520637
## + condition 1 8.7967e+12 6.2257e+14 520672
## + floors 1 7.9792e+12 6.2339e+14 520700
## + sqft_living15 1 4.2135e+12 6.2716e+14 520830
## + year 1 3.9998e+12 6.2737e+14 520838
## + sqft_lot 1 3.5158e+12 6.2786e+14 520854
## + month 11 3.7544e+12 6.2762e+14 520866
## + yr_renovated 1 2.5487e+12 6.2882e+14 520888
## + sqft_above 1 2.1336e+12 6.2924e+14 520902
## + sqft_basement 1 2.1336e+12 6.2924e+14 520902
## + sqft_lot15 1 1.6165e+12 6.2976e+14 520920
## + bathrooms 1 1.6091e+12 6.2976e+14 520920
## + lat 1 2.7263e+11 6.3110e+14 520966
## + long 1 1.4112e+11 6.3123e+14 520970
## <none> 6.3137e+14 520973
##
## Step: AIC=519861.9
## price ~ sqft_living + zipcode + waterfront + grade + view
##
## Df Sum of Sq RSS AIC
## + yr_built 1 1.3437e+13 5.8624e+14 519374
## + bedrooms 1 7.6258e+12 5.9205e+14 519587
## + condition 1 7.1374e+12 5.9254e+14 519605
## + sqft_above 1 5.4833e+12 5.9419e+14 519665
## + sqft_basement 1 5.4833e+12 5.9419e+14 519665
## + floors 1 5.2288e+12 5.9445e+14 519675
## + year 1 3.7044e+12 5.9597e+14 519730
## + month 11 3.5595e+12 5.9611e+14 519755
## + sqft_lot 1 2.6659e+12 5.9701e+14 519768
## + yr_renovated 1 1.8495e+12 5.9782e+14 519797
## + sqft_living15 1 1.2617e+12 5.9841e+14 519818
## + bathrooms 1 1.1539e+12 5.9852e+14 519822
## + sqft_lot15 1 1.1022e+12 5.9857e+14 519824
## + lat 1 3.4922e+11 5.9933e+14 519851
## + long 1 9.9357e+10 5.9957e+14 519860
## <none> 5.9967e+14 519862
##
## Step: AIC=519374.1
## price ~ sqft_living + zipcode + waterfront + grade + view + yr_built
##
## Df Sum of Sq RSS AIC
## + bedrooms 1 7.7652e+12 5.7847e+14 519088
## + sqft_above 1 6.8761e+12 5.7936e+14 519121
## + sqft_basement 1 6.8761e+12 5.7936e+14 519121
## + year 1 3.8284e+12 5.8241e+14 519235
## + month 11 3.6431e+12 5.8259e+14 519261
## + condition 1 2.4804e+12 5.8376e+14 519285
## + sqft_lot 1 1.7107e+12 5.8453e+14 519313
## + sqft_living15 1 7.2759e+11 5.8551e+14 519349
## + floors 1 5.9549e+11 5.8564e+14 519354
## + sqft_lot15 1 5.5026e+11 5.8569e+14 519356
## + lat 1 3.2692e+11 5.8591e+14 519364
## + yr_renovated 1 3.1372e+11 5.8592e+14 519365
## + bathrooms 1 2.1204e+11 5.8603e+14 519368
## + long 1 7.6113e+10 5.8616e+14 519373
## <none> 5.8624e+14 519374
##
## Step: AIC=519088
## price ~ sqft_living + zipcode + waterfront + grade + view + yr_built +
## bedrooms
##
## Df Sum of Sq RSS AIC
## + sqft_above 1 6.3370e+12 5.7214e+14 518852
## + sqft_basement 1 6.3370e+12 5.7214e+14 518852
## + year 1 3.8895e+12 5.7458e+14 518944
## + month 11 3.7727e+12 5.7470e+14 518969
## + condition 1 2.9190e+12 5.7555e+14 518981
## + sqft_lot 1 1.2696e+12 5.7720e+14 519042
## + bathrooms 1 1.1447e+12 5.7733e+14 519047
## + sqft_living15 1 5.6359e+11 5.7791e+14 519069
## + floors 1 4.7047e+11 5.7800e+14 519072
## + lat 1 2.9843e+11 5.7817e+14 519079
## + yr_renovated 1 2.8851e+11 5.7818e+14 519079
## + sqft_lot15 1 2.8621e+11 5.7819e+14 519079
## + long 1 1.0575e+11 5.7837e+14 519086
## <none> 5.7847e+14 519088
##
## Step: AIC=518851.9
## price ~ sqft_living + zipcode + waterfront + grade + view + yr_built +
## bedrooms + sqft_above
##
## Df Sum of Sq RSS AIC
## + floors 1 4.4329e+12 5.6770e+14 518686
## + condition 1 4.2128e+12 5.6792e+14 518694
## + year 1 3.8676e+12 5.6827e+14 518707
## + month 11 3.8227e+12 5.6831e+14 518729
## + bathrooms 1 1.3994e+12 5.7074e+14 518801
## + sqft_lot 1 1.1648e+12 5.7097e+14 518810
## + lat 1 3.4475e+11 5.7179e+14 518841
## + sqft_lot15 1 2.5962e+11 5.7188e+14 518844
## + yr_renovated 1 2.3738e+11 5.7190e+14 518845
## + sqft_living15 1 2.3714e+11 5.7190e+14 518845
## + long 1 1.4851e+11 5.7199e+14 518848
## <none> 5.7214e+14 518852
##
## Step: AIC=518685.8
## price ~ sqft_living + zipcode + waterfront + grade + view + yr_built +
## bedrooms + sqft_above + floors
##
## Df Sum of Sq RSS AIC
## + condition 1 4.0092e+12 5.6369e+14 518535
## + year 1 3.7412e+12 5.6396e+14 518545
## + month 11 3.7135e+12 5.6399e+14 518566
## + bathrooms 1 2.7803e+12 5.6492e+14 518582
## + sqft_lot 1 9.4701e+11 5.6676e+14 518652
## + yr_renovated 1 5.4811e+11 5.6715e+14 518667
## + lat 1 2.7934e+11 5.6742e+14 518677
## + long 1 1.8903e+11 5.6751e+14 518681
## + sqft_lot15 1 1.4587e+11 5.6756e+14 518682
## + sqft_living15 1 1.2035e+11 5.6758e+14 518683
## <none> 5.6770e+14 518686
##
## Step: AIC=518534.6
## price ~ sqft_living + zipcode + waterfront + grade + view + yr_built +
## bedrooms + sqft_above + floors + condition
##
## Df Sum of Sq RSS AIC
## + year 1 4.1601e+12 5.5953e+14 518376
## + month 11 4.0530e+12 5.5964e+14 518401
## + bathrooms 1 2.4411e+12 5.6125e+14 518443
## + yr_renovated 1 1.2094e+12 5.6248e+14 518490
## + sqft_lot 1 9.9034e+11 5.6270e+14 518499
## + lat 1 3.5099e+11 5.6334e+14 518523
## + sqft_living15 1 2.0048e+11 5.6349e+14 518529
## + long 1 1.8946e+11 5.6350e+14 518529
## + sqft_lot15 1 1.4721e+11 5.6355e+14 518531
## <none> 5.6369e+14 518535
##
## Step: AIC=518376.5
## price ~ sqft_living + zipcode + waterfront + grade + view + yr_built +
## bedrooms + sqft_above + floors + condition + year
##
## Df Sum of Sq RSS AIC
## + bathrooms 1 2.4807e+12 5.5705e+14 518282
## + yr_renovated 1 1.3452e+12 5.5819e+14 518326
## + month 11 1.6499e+12 5.5788e+14 518335
## + sqft_lot 1 9.4997e+11 5.5858e+14 518342
## + lat 1 3.2476e+11 5.5921e+14 518366
## + sqft_living15 1 2.0954e+11 5.5932e+14 518370
## + long 1 1.9735e+11 5.5934e+14 518371
## + sqft_lot15 1 1.3936e+11 5.5939e+14 518373
## <none> 5.5953e+14 518376
##
## Step: AIC=518282.5
## price ~ sqft_living + zipcode + waterfront + grade + view + yr_built +
## bedrooms + sqft_above + floors + condition + year + bathrooms
##
## Df Sum of Sq RSS AIC
## + month 11 1.6756e+12 5.5538e+14 518239
## + sqft_lot 1 9.7066e+11 5.5608e+14 518247
## + yr_renovated 1 8.9857e+11 5.5615e+14 518250
## + lat 1 3.2531e+11 5.5673e+14 518272
## + sqft_living15 1 2.5318e+11 5.5680e+14 518275
## + long 1 1.8705e+11 5.5687e+14 518277
## + sqft_lot15 1 1.6583e+11 5.5689e+14 518278
## <none> 5.5705e+14 518282
##
## Step: AIC=518239.3
## price ~ sqft_living + zipcode + waterfront + grade + view + yr_built +
## bedrooms + sqft_above + floors + condition + year + bathrooms +
## month
##
## Df Sum of Sq RSS AIC
## + sqft_lot 1 9.7162e+11 5.5441e+14 518203
## + yr_renovated 1 9.0953e+11 5.5447e+14 518206
## + lat 1 3.2891e+11 5.5505e+14 518229
## + sqft_living15 1 2.3378e+11 5.5514e+14 518232
## + long 1 1.8732e+11 5.5519e+14 518234
## + sqft_lot15 1 1.4760e+11 5.5523e+14 518236
## <none> 5.5538e+14 518239
##
## Step: AIC=518203.5
## price ~ sqft_living + zipcode + waterfront + grade + view + yr_built +
## bedrooms + sqft_above + floors + condition + year + bathrooms +
## month + sqft_lot
##
## Df Sum of Sq RSS AIC
## + yr_renovated 1 9.2987e+11 5.5348e+14 518169
## + lat 1 3.4940e+11 5.5406e+14 518192
## + long 1 2.5487e+11 5.5415e+14 518196
## + sqft_living15 1 2.4884e+11 5.5416e+14 518196
## + sqft_lot15 1 1.1490e+11 5.5429e+14 518201
## <none> 5.5441e+14 518203
##
## Step: AIC=518169.2
## price ~ sqft_living + zipcode + waterfront + grade + view + yr_built +
## bedrooms + sqft_above + floors + condition + year + bathrooms +
## month + sqft_lot + yr_renovated
##
## Df Sum of Sq RSS AIC
## + lat 1 3.3998e+11 5.5314e+14 518158
## + sqft_living15 1 3.1724e+11 5.5316e+14 518159
## + long 1 2.7104e+11 5.5320e+14 518161
## + sqft_lot15 1 1.1779e+11 5.5336e+14 518167
## <none> 5.5348e+14 518169
##
## Step: AIC=518157.9
## price ~ sqft_living + zipcode + waterfront + grade + view + yr_built +
## bedrooms + sqft_above + floors + condition + year + bathrooms +
## month + sqft_lot + yr_renovated + lat
##
## Df Sum of Sq RSS AIC
## + sqft_living15 1 3.0922e+11 5.5283e+14 518148
## + long 1 2.3146e+11 5.5290e+14 518151
## + sqft_lot15 1 1.1724e+11 5.5302e+14 518155
## <none> 5.5314e+14 518158
##
## Step: AIC=518147.8
## price ~ sqft_living + zipcode + waterfront + grade + view + yr_built +
## bedrooms + sqft_above + floors + condition + year + bathrooms +
## month + sqft_lot + yr_renovated + lat + sqft_living15
##
## Df Sum of Sq RSS AIC
## + long 1 2.2901e+11 5.5260e+14 518141
## + sqft_lot15 1 1.4242e+11 5.5268e+14 518144
## <none> 5.5283e+14 518148
##
## Step: AIC=518140.9
## price ~ sqft_living + zipcode + waterfront + grade + view + yr_built +
## bedrooms + sqft_above + floors + condition + year + bathrooms +
## month + sqft_lot + yr_renovated + lat + sqft_living15 + long
##
## Df Sum of Sq RSS AIC
## + sqft_lot15 1 1.2419e+11 5.5247e+14 518138
## <none> 5.5260e+14 518141
##
## Step: AIC=518138
## price ~ sqft_living + zipcode + waterfront + grade + view + yr_built +
## bedrooms + sqft_above + floors + condition + year + bathrooms +
## month + sqft_lot + yr_renovated + lat + sqft_living15 + long +
## sqft_lot15
##
## Df Sum of Sq RSS AIC
## <none> 5.5247e+14 518138
##
## Call:
## lm(formula = price ~ sqft_living + zipcode + waterfront + grade +
## view + yr_built + bedrooms + sqft_above + floors + condition +
## year + bathrooms + month + sqft_lot + yr_renovated + lat +
## sqft_living15 + long + sqft_lot15, data = df)
##
## Coefficients:
## (Intercept) sqft_living zipcode98002 zipcode98003 zipcode98004
## -2.523e+07 1.300e+02 3.258e+04 -2.307e+04 7.170e+05
## zipcode98005 zipcode98006 zipcode98007 zipcode98008 zipcode98010
## 2.479e+05 2.280e+05 1.947e+05 2.048e+05 9.673e+04
## zipcode98011 zipcode98014 zipcode98019 zipcode98022 zipcode98023
## 3.566e+04 7.903e+04 3.806e+04 4.270e+04 -4.615e+04
## zipcode98024 zipcode98027 zipcode98028 zipcode98029 zipcode98030
## 1.493e+05 1.516e+05 2.760e+04 1.916e+05 7.133e+02
## zipcode98031 zipcode98032 zipcode98033 zipcode98034 zipcode98038
## 2.944e+03 -7.690e+03 2.929e+05 1.222e+05 4.671e+04
## zipcode98039 zipcode98040 zipcode98042 zipcode98045 zipcode98052
## 1.252e+06 4.591e+05 1.009e+04 1.246e+05 1.668e+05
## zipcode98053 zipcode98055 zipcode98056 zipcode98058 zipcode98059
## 1.430e+05 2.237e+04 6.289e+04 1.305e+04 5.950e+04
## zipcode98065 zipcode98070 zipcode98072 zipcode98074 zipcode98075
## 8.636e+04 -6.644e+04 7.191e+04 1.323e+05 1.338e+05
## zipcode98077 zipcode98092 zipcode98102 zipcode98103 zipcode98105
## 4.946e+04 -2.484e+04 4.432e+05 2.565e+05 3.920e+05
## zipcode98106 zipcode98107 zipcode98108 zipcode98109 zipcode98112
## 9.073e+04 2.614e+05 7.564e+04 4.197e+05 5.511e+05
## zipcode98115 zipcode98116 zipcode98117 zipcode98118 zipcode98119
## 2.495e+05 2.200e+05 2.275e+05 1.232e+05 3.983e+05
## zipcode98122 zipcode98125 zipcode98126 zipcode98133 zipcode98136
## 2.749e+05 1.127e+05 1.349e+05 6.862e+04 1.845e+05
## zipcode98144 zipcode98146 zipcode98148 zipcode98155 zipcode98166
## 2.191e+05 5.762e+04 3.645e+04 4.880e+04 1.441e+04
## zipcode98168 zipcode98177 zipcode98178 zipcode98188 zipcode98198
## 3.908e+04 1.142e+05 5.744e+03 6.447e+03 -2.370e+04
## zipcode98199 waterfront grade view yr_built
## 2.995e+05 6.575e+05 5.706e+04 5.520e+04 -6.932e+02
## bedrooms sqft_above floors condition year2015
## -2.692e+04 7.929e+01 -4.532e+04 2.671e+04 6.205e+04
## bathrooms month02 month03 month04 month05
## 2.349e+04 6.357e+03 2.675e+04 3.252e+04 4.865e+04
## month06 month07 month08 month09 month10
## 6.062e+04 5.426e+04 5.784e+04 5.176e+04 5.443e+04
## month11 month12 sqft_lot yr_renovated lat
## 5.545e+04 5.673e+04 2.450e-01 1.842e+01 2.138e+05
## sqft_living15 long sqft_lot15
## 1.033e+01 -1.301e+05 -1.327e-01
linear regression model 2
# Linear regression model 2: price regressed on the first ten terms added by
# the forward stepwise search (zipcode enters as one dummy per level).
lm2 <- lm(
  price ~ (sqft_living + zipcode + waterfront + grade + view + yr_built +
    bedrooms + sqft_above + floors + condition),
  data = df
)
# Coefficient table, residual standard error, R-squared and F-statistic.
summary(lm2)
##
## Call:
## lm(formula = price ~ (sqft_living + zipcode + waterfront + grade +
## view + yr_built + bedrooms + sqft_above + floors + condition),
## data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1119069 -71685 -430 62524 4427499
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.175e+05 1.140e+05 7.170 7.72e-13 ***
## sqft_living 1.471e+02 3.182e+00 46.231 < 2e-16 ***
## zipcode98002 2.692e+04 1.431e+04 1.881 0.059999 .
## zipcode98003 -1.648e+04 1.289e+04 -1.278 0.201136
## zipcode98004 7.780e+05 1.261e+04 61.718 < 2e-16 ***
## zipcode98005 3.032e+05 1.524e+04 19.900 < 2e-16 ***
## zipcode98006 2.671e+05 1.136e+04 23.510 < 2e-16 ***
## zipcode98007 2.429e+05 1.612e+04 15.067 < 2e-16 ***
## zipcode98008 2.486e+05 1.290e+04 19.267 < 2e-16 ***
## zipcode98010 7.487e+04 1.829e+04 4.093 4.28e-05 ***
## zipcode98011 1.242e+05 1.439e+04 8.630 < 2e-16 ***
## zipcode98014 1.107e+05 1.686e+04 6.564 5.35e-11 ***
## zipcode98019 9.650e+04 1.452e+04 6.647 3.07e-11 ***
## zipcode98022 -2.359e+03 1.363e+04 -0.173 0.862616
## zipcode98023 -3.485e+04 1.119e+04 -3.116 0.001836 **
## zipcode98024 1.728e+05 1.992e+04 8.672 < 2e-16 ***
## zipcode98027 1.722e+05 1.172e+04 14.693 < 2e-16 ***
## zipcode98028 1.206e+05 1.285e+04 9.386 < 2e-16 ***
## zipcode98029 2.136e+05 1.250e+04 17.096 < 2e-16 ***
## zipcode98030 7.143e+03 1.322e+04 0.540 0.588865
## zipcode98031 1.669e+04 1.296e+04 1.287 0.197954
## zipcode98032 1.370e+03 1.681e+04 0.081 0.935064
## zipcode98033 3.640e+05 1.158e+04 31.447 < 2e-16 ***
## zipcode98034 2.020e+05 1.099e+04 18.383 < 2e-16 ***
## zipcode98038 3.768e+04 1.083e+04 3.478 0.000506 ***
## zipcode98039 1.325e+06 2.462e+04 53.830 < 2e-16 ***
## zipcode98040 5.136e+05 1.308e+04 39.264 < 2e-16 ***
## zipcode98042 8.091e+03 1.097e+04 0.737 0.460934
## zipcode98045 9.646e+04 1.383e+04 6.977 3.10e-12 ***
## zipcode98052 2.273e+05 1.091e+04 20.831 < 2e-16 ***
## zipcode98053 1.943e+05 1.182e+04 16.438 < 2e-16 ***
## zipcode98055 4.754e+04 1.305e+04 3.642 0.000272 ***
## zipcode98056 9.562e+04 1.172e+04 8.161 3.49e-16 ***
## zipcode98058 3.017e+04 1.141e+04 2.645 0.008187 **
## zipcode98059 8.482e+04 1.136e+04 7.464 8.69e-14 ***
## zipcode98065 8.727e+04 1.260e+04 6.928 4.40e-12 ***
## zipcode98070 -7.681e+03 1.739e+04 -0.442 0.658794
## zipcode98072 1.530e+05 1.301e+04 11.758 < 2e-16 ***
## zipcode98074 1.705e+05 1.159e+04 14.711 < 2e-16 ***
## zipcode98075 1.660e+05 1.221e+04 13.591 < 2e-16 ***
## zipcode98077 1.212e+05 1.443e+04 8.400 < 2e-16 ***
## zipcode98092 -3.477e+04 1.214e+04 -2.865 0.004178 **
## zipcode98102 5.141e+05 1.830e+04 28.085 < 2e-16 ***
## zipcode98103 3.411e+05 1.113e+04 30.646 < 2e-16 ***
## zipcode98105 4.716e+05 1.400e+04 33.697 < 2e-16 ***
## zipcode98106 1.504e+05 1.234e+04 12.189 < 2e-16 ***
## zipcode98107 3.522e+05 1.335e+04 26.395 < 2e-16 ***
## zipcode98108 1.294e+05 1.469e+04 8.808 < 2e-16 ***
## zipcode98109 4.965e+05 1.799e+04 27.593 < 2e-16 ***
## zipcode98112 6.226e+05 1.351e+04 46.092 < 2e-16 ***
## zipcode98115 3.314e+05 1.106e+04 29.980 < 2e-16 ***
## zipcode98116 2.928e+05 1.255e+04 23.326 < 2e-16 ***
## zipcode98117 3.178e+05 1.119e+04 28.414 < 2e-16 ***
## zipcode98118 1.733e+05 1.129e+04 15.352 < 2e-16 ***
## zipcode98119 4.819e+05 1.504e+04 32.046 < 2e-16 ***
## zipcode98122 3.427e+05 1.312e+04 26.127 < 2e-16 ***
## zipcode98125 2.007e+05 1.175e+04 17.078 < 2e-16 ***
## zipcode98126 1.939e+05 1.224e+04 15.835 < 2e-16 ***
## zipcode98133 1.666e+05 1.126e+04 14.794 < 2e-16 ***
## zipcode98136 2.482e+05 1.329e+04 18.686 < 2e-16 ***
## zipcode98144 2.821e+05 1.242e+04 22.706 < 2e-16 ***
## zipcode98146 1.088e+05 1.285e+04 8.468 < 2e-16 ***
## zipcode98148 7.140e+04 2.308e+04 3.093 0.001983 **
## zipcode98155 1.468e+05 1.149e+04 12.772 < 2e-16 ***
## zipcode98166 5.472e+04 1.333e+04 4.106 4.03e-05 ***
## zipcode98168 7.563e+04 1.311e+04 5.767 8.20e-09 ***
## zipcode98177 2.172e+05 1.336e+04 16.263 < 2e-16 ***
## zipcode98178 3.823e+04 1.322e+04 2.893 0.003824 **
## zipcode98188 3.635e+04 1.629e+04 2.231 0.025680 *
## zipcode98198 -1.398e+03 1.292e+04 -0.108 0.913827
## zipcode98199 3.883e+05 1.269e+04 30.597 < 2e-16 ***
## waterfront 6.587e+05 1.418e+04 46.438 < 2e-16 ***
## grade 6.039e+04 1.767e+03 34.180 < 2e-16 ***
## view 5.721e+04 1.738e+03 32.919 < 2e-16 ***
## yr_built -6.724e+02 5.875e+01 -11.444 < 2e-16 ***
## bedrooms -2.470e+04 1.506e+03 -16.401 < 2e-16 ***
## sqft_above 7.558e+01 3.597e+00 21.014 < 2e-16 ***
## floors -3.923e+04 3.086e+03 -12.711 < 2e-16 ***
## condition 2.352e+04 1.901e+03 12.376 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 161800 on 21534 degrees of freedom
## Multiple R-squared: 0.8065, Adjusted R-squared: 0.8058
## F-statistic: 1151 on 78 and 21534 DF, p-value: < 2.2e-16
Zipcode now becomes one of the top 10 predictors because it is treated as a factor variable rather than a numeric one.
The value of R^2 is 0.806, which shows that the model fits the data reasonably well; it could be improved further by finding more important predictors than the current ones. Most of the predictors in the model are now highly significant.
Does the model suffer from heteroskedasticity? (Use bptest in the lmtest package in R. In Python, use het_breuschpagan test in the statsmodels package)
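Answer: Yes, the model suffers from heteroskedasticity: the p-value of the Breusch-Pagan test (< 2.2e-16) is less than 0.05, so the null hypothesis of constant error variance is rejected.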
# Breusch-Pagan test for heteroskedasticity in lm2; studentize = TRUE is the
# default and is spelled out here to match the "studentized" test reported.
lmtest::bptest(lm2, studentize = TRUE)
##
## studentized Breusch-Pagan test
##
## data: lm2
## BP = 2884, df = 78, p-value < 2.2e-16
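Are there nonlinearities in the model? (Use the plots to discern this) Answer: Yes, there are nonlinearities in the model. For a linear relationship the smoothed trend line in the Residuals vs Fitted plot should be flat (horizontal around zero); a trend line that curves or drifts away from that flat line is the indication of nonlinearity.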
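Are the residuals normally distributed? Answer: No, the residuals are not normally distributed: in the Normal Q-Q plot the points do not follow the straight reference line, and the residual summary shows a long right tail (maximum of about 4.43 million against a third quartile of about 62,500).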
# Draw the lm diagnostic plots for lm2 as 4 charts in 1 panel (2 x 2 grid).
# par() returns the settings it replaces, so they are saved and restored
# afterwards; otherwise every later plot would inherit the 2 x 2 layout.
old_par <- par(mfrow = c(2, 2))
plot(lm2)
par(old_par)
Section 3 - Extra Credit
# Treat the bedroom count as a categorical predictor instead of a number.
df[["bedrooms"]] <- as.factor(df[["bedrooms"]])

# Intercept-only baseline model of price.
null_model <- lm(formula = price ~ 1, data = df)
summary(null_model)
##
## Call:
## lm(formula = price ~ 1, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -465088 -218138 -90088 104912 7159912
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 540088 2497 216.3 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 367100 on 21612 degrees of freedom
# Largest candidate model: price against every other column of df.
# One coefficient (sqft_basement) is reported as NA because of singularities.
full_model <- lm(formula = price ~ ., data = df)
summary(full_model)
##
## Call:
## lm(formula = price ~ ., data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1166293 -69414 -433 61664 4377522
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.517e+07 6.115e+06 -4.116 3.86e-05 ***
## bedrooms1 -3.878e+04 4.606e+04 -0.842 0.399823
## bedrooms2 -8.181e+04 4.473e+04 -1.829 0.067438 .
## bedrooms3 -1.132e+05 4.468e+04 -2.533 0.011327 *
## bedrooms4 -1.499e+05 4.473e+04 -3.352 0.000803 ***
## bedrooms5 -1.608e+05 4.493e+04 -3.578 0.000347 ***
## bedrooms6 -1.756e+05 4.588e+04 -3.827 0.000130 ***
## bedrooms7 -2.787e+05 5.199e+04 -5.360 8.39e-08 ***
## bedrooms8 -1.483e+05 6.332e+04 -2.342 0.019196 *
## bedrooms9 -3.662e+05 7.978e+04 -4.590 4.46e-06 ***
## bedrooms10 -4.744e+05 1.029e+05 -4.610 4.05e-06 ***
## bedrooms11 -2.723e+05 1.666e+05 -1.635 0.102116
## bedrooms33 -6.439e+04 1.663e+05 -0.387 0.698706
## bathrooms 2.424e+04 2.629e+03 9.220 < 2e-16 ***
## sqft_living 1.305e+02 3.558e+00 36.673 < 2e-16 ***
## sqft_lot 2.422e-01 3.829e-02 6.325 2.58e-10 ***
## floors -4.471e+04 3.153e+03 -14.181 < 2e-16 ***
## waterfront 6.547e+05 1.408e+04 46.488 < 2e-16 ***
## view 5.477e+04 1.750e+03 31.307 < 2e-16 ***
## condition 2.702e+04 1.928e+03 14.009 < 2e-16 ***
## grade 5.746e+04 1.823e+03 31.515 < 2e-16 ***
## sqft_above 7.994e+01 3.625e+00 22.053 < 2e-16 ***
## sqft_basement NA NA NA NA
## yr_built -7.051e+02 6.488e+01 -10.869 < 2e-16 ***
## yr_renovated 1.839e+01 2.938e+00 6.260 3.93e-10 ***
## zipcode98002 3.135e+04 1.441e+04 2.175 0.029607 *
## zipcode98003 -2.400e+04 1.288e+04 -1.863 0.062409 .
## zipcode98004 7.159e+05 2.342e+04 30.573 < 2e-16 ***
## zipcode98005 2.477e+05 2.502e+04 9.899 < 2e-16 ***
## zipcode98006 2.278e+05 2.046e+04 11.133 < 2e-16 ***
## zipcode98007 1.950e+05 2.583e+04 7.549 4.57e-14 ***
## zipcode98008 2.055e+05 2.453e+04 8.379 < 2e-16 ***
## zipcode98010 9.688e+04 2.197e+04 4.410 1.04e-05 ***
## zipcode98011 3.540e+04 3.191e+04 1.110 0.267219
## zipcode98014 7.830e+04 3.505e+04 2.234 0.025481 *
## zipcode98019 3.875e+04 3.456e+04 1.121 0.262272
## zipcode98022 4.171e+04 1.908e+04 2.186 0.028796 *
## zipcode98023 -4.618e+04 1.185e+04 -3.898 9.71e-05 ***
## zipcode98024 1.483e+05 3.078e+04 4.819 1.46e-06 ***
## zipcode98027 1.512e+05 2.100e+04 7.199 6.26e-13 ***
## zipcode98028 2.876e+04 3.098e+04 0.928 0.353243
## zipcode98029 1.921e+05 2.398e+04 8.010 1.21e-15 ***
## zipcode98030 1.095e+03 1.416e+04 0.077 0.938348
## zipcode98031 3.575e+03 1.474e+04 0.242 0.808397
## zipcode98032 -7.741e+03 1.712e+04 -0.452 0.651253
## zipcode98033 2.933e+05 2.658e+04 11.035 < 2e-16 ***
## zipcode98034 1.230e+05 2.850e+04 4.317 1.59e-05 ***
## zipcode98038 4.720e+04 1.590e+04 2.969 0.002994 **
## zipcode98039 1.251e+06 3.164e+04 39.522 < 2e-16 ***
## zipcode98040 4.576e+05 2.071e+04 22.094 < 2e-16 ***
## zipcode98042 1.001e+04 1.355e+04 0.739 0.460060
## zipcode98045 1.249e+05 2.938e+04 4.251 2.14e-05 ***
## zipcode98052 1.673e+05 2.713e+04 6.166 7.13e-10 ***
## zipcode98053 1.425e+05 2.907e+04 4.904 9.48e-07 ***
## zipcode98055 2.101e+04 1.643e+04 1.279 0.200851
## zipcode98056 6.305e+04 1.785e+04 3.532 0.000413 ***
## zipcode98058 1.335e+04 1.552e+04 0.861 0.389516
## zipcode98059 5.996e+04 1.751e+04 3.425 0.000616 ***
## zipcode98065 8.599e+04 2.708e+04 3.175 0.001499 **
## zipcode98070 -6.859e+04 2.066e+04 -3.319 0.000904 ***
## zipcode98072 7.303e+04 3.173e+04 2.301 0.021380 *
## zipcode98074 1.328e+05 2.570e+04 5.169 2.37e-07 ***
## zipcode98075 1.350e+05 2.471e+04 5.463 4.75e-08 ***
## zipcode98077 5.075e+04 3.302e+04 1.537 0.124307
## zipcode98092 -2.432e+04 1.288e+04 -1.889 0.058939 .
## zipcode98102 4.413e+05 2.741e+04 16.099 < 2e-16 ***
## zipcode98103 2.532e+05 2.571e+04 9.847 < 2e-16 ***
## zipcode98105 3.914e+05 2.641e+04 14.820 < 2e-16 ***
## zipcode98106 8.882e+04 1.905e+04 4.662 3.16e-06 ***
## zipcode98107 2.586e+05 2.651e+04 9.754 < 2e-16 ***
## zipcode98108 7.375e+04 2.103e+04 3.506 0.000455 ***
## zipcode98109 4.182e+05 2.731e+04 15.315 < 2e-16 ***
## zipcode98112 5.478e+05 2.423e+04 22.603 < 2e-16 ***
## zipcode98115 2.491e+05 2.613e+04 9.534 < 2e-16 ***
## zipcode98116 2.188e+05 2.126e+04 10.290 < 2e-16 ***
## zipcode98117 2.264e+05 2.646e+04 8.554 < 2e-16 ***
## zipcode98118 1.212e+05 1.857e+04 6.523 7.04e-11 ***
## zipcode98119 3.960e+05 2.579e+04 15.350 < 2e-16 ***
## zipcode98122 2.724e+05 2.301e+04 11.837 < 2e-16 ***
## zipcode98125 1.128e+05 2.822e+04 3.999 6.39e-05 ***
## zipcode98126 1.328e+05 1.954e+04 6.795 1.11e-11 ***
## zipcode98133 6.827e+04 2.913e+04 2.343 0.019120 *
## zipcode98136 1.827e+05 2.002e+04 9.127 < 2e-16 ***
## zipcode98144 2.178e+05 2.139e+04 10.184 < 2e-16 ***
## zipcode98146 5.658e+04 1.787e+04 3.166 0.001549 **
## zipcode98148 3.643e+04 2.431e+04 1.498 0.134073
## zipcode98155 4.950e+04 3.030e+04 1.634 0.102373
## zipcode98166 1.402e+04 1.636e+04 0.857 0.391311
## zipcode98168 3.790e+04 1.729e+04 2.192 0.028354 *
## zipcode98177 1.140e+05 3.041e+04 3.747 0.000179 ***
## zipcode98178 5.822e+03 1.786e+04 0.326 0.744381
## zipcode98188 6.150e+03 1.833e+04 0.336 0.737168
## zipcode98198 -2.439e+04 1.389e+04 -1.756 0.079096 .
## zipcode98199 2.988e+05 2.512e+04 11.892 < 2e-16 ***
## lat 2.102e+05 6.314e+04 3.330 0.000871 ***
## long -1.313e+05 4.535e+04 -2.896 0.003778 **
## sqft_living15 1.067e+01 2.891e+00 3.692 0.000223 ***
## sqft_lot15 -1.378e-01 6.027e-02 -2.287 0.022227 *
## year2015 6.119e+04 7.382e+03 8.289 < 2e-16 ***
## month02 5.761e+03 6.847e+03 0.841 0.400128
## month03 2.664e+04 6.323e+03 4.214 2.52e-05 ***
## month04 3.245e+04 6.151e+03 5.275 1.34e-07 ***
## month05 4.791e+04 8.136e+03 5.889 3.94e-09 ***
## month06 5.923e+04 9.622e+03 6.155 7.63e-10 ***
## month07 5.325e+04 9.613e+03 5.540 3.07e-08 ***
## month08 5.679e+04 9.698e+03 5.855 4.84e-09 ***
## month09 5.081e+04 9.757e+03 5.208 1.92e-07 ***
## month10 5.351e+04 9.724e+03 5.503 3.77e-08 ***
## month11 5.425e+04 9.954e+03 5.450 5.10e-08 ***
## month12 5.545e+04 9.914e+03 5.593 2.26e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 160000 on 21504 degrees of freedom
## Multiple R-squared: 0.811, Adjusted R-squared: 0.81
## F-statistic: 854.4 on 108 and 21504 DF, p-value: < 2.2e-16
# Forward stepwise selection by AIC: start from the intercept-only model and
# add one term per step, never going beyond the terms of the full model.
step(
  object = null_model,
  direction = "forward",
  scope = list(upper = full_model, lower = null_model)
)
## Start: AIC=553875.8
## price ~ 1
##
## Df Sum of Sq RSS AIC
## + sqft_living 1 1.4356e+15 1.4773e+15 539204
## + grade 1 1.2976e+15 1.6153e+15 541134
## + zipcode 69 1.1867e+15 1.7262e+15 542706
## + sqft_above 1 1.0682e+15 1.8447e+15 544004
## + sqft_living15 1 9.9816e+14 1.9148e+15 544810
## + bathrooms 1 8.0329e+14 2.1096e+15 546904
## + view 1 4.5978e+14 2.4531e+15 550165
## + bedrooms 12 3.1033e+14 2.6026e+15 551465
## + sqft_basement 1 3.0544e+14 2.6075e+15 551484
## + lat 1 2.7455e+14 2.6384e+15 551738
## + waterfront 1 2.0668e+14 2.7062e+15 552287
## + floors 1 1.9209e+14 2.7208e+15 552403
## + yr_renovated 1 4.6564e+13 2.8664e+15 553529
## + sqft_lot 1 2.3417e+13 2.8895e+15 553703
## + sqft_lot15 1 1.9801e+13 2.8931e+15 553730
## + yr_built 1 8.4977e+12 2.9044e+15 553815
## + condition 1 3.8514e+12 2.9091e+15 553849
## + month 11 4.6632e+12 2.9083e+15 553863
## + long 1 1.3624e+12 2.9116e+15 553868
## <none> 2.9129e+15 553876
## + year 1 3.7251e+10 2.9129e+15 553878
##
## Step: AIC=539203.5
## price ~ sqft_living
##
## Df Sum of Sq RSS AIC
## + zipcode 69 6.9104e+14 7.8624e+14 525710
## + lat 1 2.1314e+14 1.2641e+15 535838
## + view 1 1.2362e+14 1.3537e+15 537317
## + grade 1 1.2132e+14 1.3560e+15 537353
## + waterfront 1 1.1024e+14 1.3670e+15 537529
## + yr_built 1 9.2854e+13 1.3844e+15 537802
## + long 1 6.6817e+13 1.4105e+15 538205
## + bedrooms 12 5.8596e+13 1.4187e+15 538353
## + yr_renovated 1 2.2405e+13 1.4549e+15 538875
## + sqft_living15 1 2.0109e+13 1.4572e+15 538909
## + condition 1 1.7605e+13 1.4597e+15 538946
## + sqft_lot15 1 6.4407e+12 1.4708e+15 539111
## + sqft_lot 1 3.0113e+12 1.4743e+15 539161
## + month 11 4.0868e+12 1.4732e+15 539166
## + year 1 1.6739e+12 1.4756e+15 539181
## + sqft_above 1 1.2165e+12 1.4761e+15 539188
## + sqft_basement 1 1.2165e+12 1.4761e+15 539188
## + floors 1 2.2991e+11 1.4770e+15 539202
## + bathrooms 1 1.4719e+11 1.4771e+15 539203
## <none> 1.4773e+15 539204
##
## Step: AIC=525710.2
## price ~ sqft_living + zipcode
##
## Df Sum of Sq RSS AIC
## + waterfront 1 1.1577e+14 6.7047e+14 522270
## + view 1 9.4436e+13 6.9180e+14 522947
## + grade 1 4.2251e+13 7.4398e+14 524518
## + bedrooms 12 2.7266e+13 7.5897e+14 524971
## + sqft_living15 1 1.6595e+13 7.6964e+14 525251
## + sqft_above 1 8.9744e+12 7.7726e+14 525464
## + sqft_basement 1 8.9744e+12 7.7726e+14 525464
## + yr_renovated 1 4.6850e+12 7.8155e+14 525583
## + year 1 3.9193e+12 7.8232e+14 525604
## + condition 1 3.8480e+12 7.8239e+14 525606
## + yr_built 1 3.6430e+12 7.8259e+14 525612
## + month 11 3.9947e+12 7.8224e+14 525622
## + sqft_lot 1 2.9632e+12 7.8327e+14 525631
## + sqft_lot15 1 1.2435e+12 7.8499e+14 525678
## + long 1 7.0269e+11 7.8553e+14 525693
## + floors 1 5.5501e+11 7.8568e+14 525697
## + lat 1 1.1386e+11 7.8612e+14 525709
## <none> 7.8624e+14 525710
## + bathrooms 1 9.9318e+09 7.8623e+14 525712
##
## Step: AIC=522269.7
## price ~ sqft_living + zipcode + waterfront
##
## Df Sum of Sq RSS AIC
## + grade 1 3.9096e+13 6.3137e+14 520973
## + view 1 3.7180e+13 6.3329e+14 521039
## + bedrooms 12 1.8522e+13 6.5195e+14 521688
## + sqft_living15 1 1.3958e+13 6.5651e+14 521817
## + sqft_above 1 1.0767e+13 6.5970e+14 521922
## + sqft_basement 1 1.0767e+13 6.5970e+14 521922
## + year 1 3.8981e+12 6.6657e+14 522146
## + sqft_lot 1 3.5114e+12 6.6696e+14 522158
## + condition 1 3.3423e+12 6.6713e+14 522164
## + month 11 3.9016e+12 6.6657e+14 522166
## + yr_built 1 1.9059e+12 6.6856e+14 522210
## + yr_renovated 1 1.6654e+12 6.6880e+14 522218
## + sqft_lot15 1 1.5408e+12 6.6893e+14 522222
## + floors 1 6.3045e+11 6.6984e+14 522251
## + lat 1 4.0376e+11 6.7006e+14 522259
## + long 1 2.3133e+11 6.7024e+14 522264
## <none> 6.7047e+14 522270
## + bathrooms 1 3.8057e+06 6.7047e+14 522272
##
## Step: AIC=520973.2
## price ~ sqft_living + zipcode + waterfront + grade
##
## Df Sum of Sq RSS AIC
## + view 1 3.1697e+13 5.9967e+14 519862
## + yr_built 1 1.8348e+13 6.1302e+14 520338
## + bedrooms 12 1.2047e+13 6.1933e+14 520581
## + condition 1 8.7967e+12 6.2257e+14 520672
## + floors 1 7.9792e+12 6.2339e+14 520700
## + sqft_living15 1 4.2135e+12 6.2716e+14 520830
## + year 1 3.9998e+12 6.2737e+14 520838
## + sqft_lot 1 3.5158e+12 6.2786e+14 520854
## + month 11 3.7544e+12 6.2762e+14 520866
## + yr_renovated 1 2.5487e+12 6.2882e+14 520888
## + sqft_above 1 2.1336e+12 6.2924e+14 520902
## + sqft_basement 1 2.1336e+12 6.2924e+14 520902
## + sqft_lot15 1 1.6165e+12 6.2976e+14 520920
## + bathrooms 1 1.6091e+12 6.2976e+14 520920
## + lat 1 2.7263e+11 6.3110e+14 520966
## + long 1 1.4112e+11 6.3123e+14 520970
## <none> 6.3137e+14 520973
##
## Step: AIC=519861.9
## price ~ sqft_living + zipcode + waterfront + grade + view
##
## Df Sum of Sq RSS AIC
## + yr_built 1 1.3437e+13 5.8624e+14 519374
## + bedrooms 12 9.3922e+12 5.9028e+14 519545
## + condition 1 7.1374e+12 5.9254e+14 519605
## + sqft_above 1 5.4833e+12 5.9419e+14 519665
## + sqft_basement 1 5.4833e+12 5.9419e+14 519665
## + floors 1 5.2288e+12 5.9445e+14 519675
## + year 1 3.7044e+12 5.9597e+14 519730
## + month 11 3.5595e+12 5.9611e+14 519755
## + sqft_lot 1 2.6659e+12 5.9701e+14 519768
## + yr_renovated 1 1.8495e+12 5.9782e+14 519797
## + sqft_living15 1 1.2617e+12 5.9841e+14 519818
## + bathrooms 1 1.1539e+12 5.9852e+14 519822
## + sqft_lot15 1 1.1022e+12 5.9857e+14 519824
## + lat 1 3.4922e+11 5.9933e+14 519851
## + long 1 9.9357e+10 5.9957e+14 519860
## <none> 5.9967e+14 519862
##
## Step: AIC=519374.1
## price ~ sqft_living + zipcode + waterfront + grade + view + yr_built
##
## Df Sum of Sq RSS AIC
## + bedrooms 12 9.4873e+12 5.7675e+14 519046
## + sqft_above 1 6.8761e+12 5.7936e+14 519121
## + sqft_basement 1 6.8761e+12 5.7936e+14 519121
## + year 1 3.8284e+12 5.8241e+14 519235
## + month 11 3.6431e+12 5.8259e+14 519261
## + condition 1 2.4804e+12 5.8376e+14 519285
## + sqft_lot 1 1.7107e+12 5.8453e+14 519313
## + sqft_living15 1 7.2759e+11 5.8551e+14 519349
## + floors 1 5.9549e+11 5.8564e+14 519354
## + sqft_lot15 1 5.5026e+11 5.8569e+14 519356
## + lat 1 3.2692e+11 5.8591e+14 519364
## + yr_renovated 1 3.1372e+11 5.8592e+14 519365
## + bathrooms 1 2.1204e+11 5.8603e+14 519368
## + long 1 7.6113e+10 5.8616e+14 519373
## <none> 5.8624e+14 519374
##
## Step: AIC=519045.5
## price ~ sqft_living + zipcode + waterfront + grade + view + yr_built +
## bedrooms
##
## Df Sum of Sq RSS AIC
## + sqft_above 1 6.5830e+12 5.7017e+14 518799
## + sqft_basement 1 6.5830e+12 5.7017e+14 518799
## + year 1 3.8749e+12 5.7288e+14 518902
## + month 11 3.7848e+12 5.7297e+14 518925
## + condition 1 2.9803e+12 5.7377e+14 518936
## + bathrooms 1 1.2558e+12 5.7549e+14 519000
## + sqft_lot 1 1.2079e+12 5.7554e+14 519002
## + sqft_living15 1 5.9721e+11 5.7615e+14 519025
## + floors 1 3.9660e+11 5.7635e+14 519033
## + yr_renovated 1 2.9137e+11 5.7646e+14 519037
## + lat 1 2.8547e+11 5.7646e+14 519037
## + sqft_lot15 1 2.5491e+11 5.7650e+14 519038
## + long 1 1.0990e+11 5.7664e+14 519043
## <none> 5.7675e+14 519046
##
## Step: AIC=518799.4
## price ~ sqft_living + zipcode + waterfront + grade + view + yr_built +
## bedrooms + sqft_above
##
## Df Sum of Sq RSS AIC
## + condition 1 4.2995e+12 5.6587e+14 518638
## + floors 1 4.2484e+12 5.6592e+14 518640
## + year 1 3.8461e+12 5.6632e+14 518655
## + month 11 3.8309e+12 5.6634e+14 518676
## + bathrooms 1 1.5181e+12 5.6865e+14 518744
## + sqft_lot 1 1.1020e+12 5.6907e+14 518760
## + lat 1 3.3323e+11 5.6983e+14 518789
## + sqft_living15 1 2.6109e+11 5.6991e+14 518791
## + yr_renovated 1 2.3990e+11 5.6993e+14 518792
## + sqft_lot15 1 2.2787e+11 5.6994e+14 518793
## + long 1 1.5276e+11 5.7001e+14 518796
## <none> 5.7017e+14 518799
##
## Step: AIC=518637.8
## price ~ sqft_living + zipcode + waterfront + grade + view + yr_built +
## bedrooms + sqft_above + condition
##
## Df Sum of Sq RSS AIC
## + year 1 4.2813e+12 5.6159e+14 518476
## + floors 1 4.0451e+12 5.6182e+14 518485
## + month 11 4.1925e+12 5.6168e+14 518499
## + bathrooms 1 1.2895e+12 5.6458e+14 518590
## + sqft_lot 1 1.1403e+12 5.6473e+14 518596
## + yr_renovated 1 7.4627e+11 5.6512e+14 518611
## + lat 1 4.1043e+11 5.6546e+14 518624
## + sqft_living15 1 3.7407e+11 5.6549e+14 518626
## + sqft_lot15 1 2.2618e+11 5.6564e+14 518631
## + long 1 1.5578e+11 5.6571e+14 518634
## <none> 5.6587e+14 518638
##
## Step: AIC=518475.7
## price ~ sqft_living + zipcode + waterfront + grade + view + yr_built +
## bedrooms + sqft_above + condition + year
##
## Df Sum of Sq RSS AIC
## + floors 1 3.9090e+12 5.5768e+14 518327
## + bathrooms 1 1.3385e+12 5.6025e+14 518426
## + month 11 1.6235e+12 5.5996e+14 518435
## + sqft_lot 1 1.0929e+12 5.6049e+14 518436
## + yr_renovated 1 8.6150e+11 5.6072e+14 518444
## + sqft_living15 1 3.8254e+11 5.6120e+14 518463
## + lat 1 3.8029e+11 5.6121e+14 518463
## + sqft_lot15 1 2.1435e+11 5.6137e+14 518469
## + long 1 1.6422e+11 5.6142e+14 518471
## <none> 5.6159e+14 518476
##
## Step: AIC=518326.7
## price ~ sqft_living + zipcode + waterfront + grade + view + yr_built +
## bedrooms + sqft_above + condition + year + floors
##
## Df Sum of Sq RSS AIC
## + bathrooms 1 2.6150e+12 5.5506e+14 518227
## + yr_renovated 1 1.3500e+12 5.5633e+14 518276
## + month 11 1.6328e+12 5.5604e+14 518285
## + sqft_lot 1 8.9860e+11 5.5678e+14 518294
## + lat 1 3.1513e+11 5.5736e+14 518316
## + sqft_living15 1 2.3140e+11 5.5745e+14 518320
## + long 1 2.0375e+11 5.5747e+14 518321
## + sqft_lot15 1 1.1877e+11 5.5756e+14 518324
## <none> 5.5768e+14 518327
##
## Step: AIC=518227.1
## price ~ sqft_living + zipcode + waterfront + grade + view + yr_built +
## bedrooms + sqft_above + condition + year + floors + bathrooms
##
## Df Sum of Sq RSS AIC
## + month 11 1.6541e+12 5.5341e+14 518185
## + sqft_lot 1 9.1786e+11 5.5414e+14 518193
## + yr_renovated 1 8.9343e+11 5.5417e+14 518194
## + lat 1 3.1540e+11 5.5475e+14 518217
## + sqft_living15 1 2.6851e+11 5.5479e+14 518219
## + long 1 1.9322e+11 5.5487e+14 518222
## + sqft_lot15 1 1.4455e+11 5.5492e+14 518223
## <none> 5.5506e+14 518227
##
## Step: AIC=518184.6
## price ~ sqft_living + zipcode + waterfront + grade + view + yr_built +
## bedrooms + sqft_above + condition + year + floors + bathrooms +
## month
##
## Df Sum of Sq RSS AIC
## + sqft_lot 1 9.1841e+11 5.5249e+14 518151
## + yr_renovated 1 9.0279e+11 5.5251e+14 518151
## + lat 1 3.1894e+11 5.5309e+14 518174
## + sqft_living15 1 2.4979e+11 5.5316e+14 518177
## + long 1 1.9370e+11 5.5321e+14 518179
## + sqft_lot15 1 1.2745e+11 5.5328e+14 518182
## <none> 5.5341e+14 518185
##
## Step: AIC=518150.7
## price ~ sqft_living + zipcode + waterfront + grade + view + yr_built +
## bedrooms + sqft_above + condition + year + floors + bathrooms +
## month + sqft_lot
##
## Df Sum of Sq RSS AIC
## + yr_renovated 1 9.2234e+11 5.5157e+14 518117
## + lat 1 3.3892e+11 5.5215e+14 518139
## + sqft_living15 1 2.6370e+11 5.5223e+14 518142
## + long 1 2.6016e+11 5.5223e+14 518143
## + sqft_lot15 1 1.2331e+11 5.5237e+14 518148
## <none> 5.5249e+14 518151
##
## Step: AIC=518116.6
## price ~ sqft_living + zipcode + waterfront + grade + view + yr_built +
## bedrooms + sqft_above + condition + year + floors + bathrooms +
## month + sqft_lot + yr_renovated
##
## Df Sum of Sq RSS AIC
## + sqft_living15 1 3.3413e+11 5.5123e+14 518106
## + lat 1 3.2969e+11 5.5124e+14 518106
## + long 1 2.7642e+11 5.5129e+14 518108
## + sqft_lot15 1 1.2587e+11 5.5144e+14 518114
## <none> 5.5157e+14 518117
##
## Step: AIC=518105.5
## price ~ sqft_living + zipcode + waterfront + grade + view + yr_built +
## bedrooms + sqft_above + condition + year + floors + bathrooms +
## month + sqft_lot + yr_renovated + sqft_living15
##
## Df Sum of Sq RSS AIC
## + lat 1 3.2184e+11 5.5091e+14 518095
## + long 1 2.7257e+11 5.5096e+14 518097
## + sqft_lot15 1 1.5383e+11 5.5108e+14 518101
## <none> 5.5123e+14 518106
##
## Step: AIC=518094.9
## price ~ sqft_living + zipcode + waterfront + grade + view + yr_built +
## bedrooms + sqft_above + condition + year + floors + bathrooms +
## month + sqft_lot + yr_renovated + sqft_living15 + lat
##
## Df Sum of Sq RSS AIC
## + long 1 2.3388e+11 5.5068e+14 518088
## + sqft_lot15 1 1.5297e+11 5.5076e+14 518091
## <none> 5.5091e+14 518095
##
## Step: AIC=518087.7
## price ~ sqft_living + zipcode + waterfront + grade + view + yr_built +
## bedrooms + sqft_above + condition + year + floors + bathrooms +
## month + sqft_lot + yr_renovated + sqft_living15 + lat + long
##
## Df Sum of Sq RSS AIC
## + sqft_lot15 1 1.3387e+11 5.5054e+14 518084
## <none> 5.5068e+14 518088
##
## Step: AIC=518084.5
## price ~ sqft_living + zipcode + waterfront + grade + view + yr_built +
## bedrooms + sqft_above + condition + year + floors + bathrooms +
## month + sqft_lot + yr_renovated + sqft_living15 + lat + long +
## sqft_lot15
##
## Df Sum of Sq RSS AIC
## <none> 5.5054e+14 518084
##
## Call:
## lm(formula = price ~ sqft_living + zipcode + waterfront + grade +
## view + yr_built + bedrooms + sqft_above + condition + year +
## floors + bathrooms + month + sqft_lot + yr_renovated + sqft_living15 +
## lat + long + sqft_lot15, data = df)
##
## Coefficients:
## (Intercept) sqft_living zipcode98002 zipcode98003 zipcode98004
## -2.517e+07 1.305e+02 3.135e+04 -2.400e+04 7.159e+05
## zipcode98005 zipcode98006 zipcode98007 zipcode98008 zipcode98010
## 2.477e+05 2.278e+05 1.950e+05 2.055e+05 9.688e+04
## zipcode98011 zipcode98014 zipcode98019 zipcode98022 zipcode98023
## 3.540e+04 7.830e+04 3.875e+04 4.171e+04 -4.618e+04
## zipcode98024 zipcode98027 zipcode98028 zipcode98029 zipcode98030
## 1.483e+05 1.512e+05 2.876e+04 1.921e+05 1.095e+03
## zipcode98031 zipcode98032 zipcode98033 zipcode98034 zipcode98038
## 3.575e+03 -7.741e+03 2.933e+05 1.230e+05 4.720e+04
## zipcode98039 zipcode98040 zipcode98042 zipcode98045 zipcode98052
## 1.251e+06 4.576e+05 1.001e+04 1.249e+05 1.673e+05
## zipcode98053 zipcode98055 zipcode98056 zipcode98058 zipcode98059
## 1.425e+05 2.101e+04 6.305e+04 1.335e+04 5.996e+04
## zipcode98065 zipcode98070 zipcode98072 zipcode98074 zipcode98075
## 8.599e+04 -6.859e+04 7.303e+04 1.328e+05 1.350e+05
## zipcode98077 zipcode98092 zipcode98102 zipcode98103 zipcode98105
## 5.075e+04 -2.432e+04 4.413e+05 2.532e+05 3.914e+05
## zipcode98106 zipcode98107 zipcode98108 zipcode98109 zipcode98112
## 8.882e+04 2.586e+05 7.375e+04 4.182e+05 5.478e+05
## zipcode98115 zipcode98116 zipcode98117 zipcode98118 zipcode98119
## 2.491e+05 2.188e+05 2.264e+05 1.212e+05 3.960e+05
## zipcode98122 zipcode98125 zipcode98126 zipcode98133 zipcode98136
## 2.724e+05 1.128e+05 1.328e+05 6.827e+04 1.827e+05
## zipcode98144 zipcode98146 zipcode98148 zipcode98155 zipcode98166
## 2.178e+05 5.658e+04 3.643e+04 4.950e+04 1.402e+04
## zipcode98168 zipcode98177 zipcode98178 zipcode98188 zipcode98198
## 3.790e+04 1.140e+05 5.822e+03 6.150e+03 -2.439e+04
## zipcode98199 waterfront grade view yr_built
## 2.988e+05 6.547e+05 5.746e+04 5.477e+04 -7.051e+02
## bedrooms1 bedrooms2 bedrooms3 bedrooms4 bedrooms5
## -3.878e+04 -8.181e+04 -1.132e+05 -1.499e+05 -1.608e+05
## bedrooms6 bedrooms7 bedrooms8 bedrooms9 bedrooms10
## -1.756e+05 -2.787e+05 -1.483e+05 -3.662e+05 -4.744e+05
## bedrooms11 bedrooms33 sqft_above condition year2015
## -2.723e+05 -6.439e+04 7.994e+01 2.702e+04 6.119e+04
## floors bathrooms month02 month03 month04
## -4.471e+04 2.424e+04 5.761e+03 2.664e+04 3.245e+04
## month05 month06 month07 month08 month09
## 4.791e+04 5.923e+04 5.325e+04 5.679e+04 5.081e+04
## month10 month11 month12 sqft_lot yr_renovated
## 5.351e+04 5.425e+04 5.545e+04 2.422e-01 1.839e+01
## sqft_living15 lat long sqft_lot15
## 1.067e+01 2.102e+05 -1.313e+05 -1.378e-01
library(MASS)
# Standardise every numeric column of df in place (centre and scale);
# non-numeric columns are left untouched.
ind <- vapply(df, is.numeric, logical(1))
df[ind] <- lapply(df[ind], scale)
# Linear model on the standardised data. bedrooms and bathrooms enter only
# through their interaction term.
lm4 <- lm(
  price ~ (grade + zipcode + sqft_living + waterfront + view + condition +
    year + yr_renovated + sqft_above + bedrooms:bathrooms),
  data = df
)
summary(lm4)
##
## Call:
## lm(formula = price ~ (grade + zipcode + sqft_living + waterfront +
## view + condition + year + yr_renovated + sqft_above + bedrooms:bathrooms),
## data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.5289 -0.1884 -0.0004 0.1690 11.0437
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.583250 0.023151 -25.194 < 2e-16 ***
## grade 0.194013 0.005406 35.890 < 2e-16 ***
## zipcode98002 0.044481 0.038674 1.150 0.250098
## zipcode98003 -0.026238 0.034812 -0.754 0.451026
## zipcode98004 2.134094 0.034037 62.699 < 2e-16 ***
## zipcode98005 0.868687 0.041076 21.148 < 2e-16 ***
## zipcode98006 0.730462 0.030683 23.807 < 2e-16 ***
## zipcode98007 0.668774 0.043505 15.372 < 2e-16 ***
## zipcode98008 0.701716 0.034801 20.163 < 2e-16 ***
## zipcode98010 0.197724 0.049435 4.000 6.36e-05 ***
## zipcode98011 0.363206 0.038880 9.342 < 2e-16 ***
## zipcode98014 0.339819 0.045545 7.461 8.91e-14 ***
## zipcode98019 0.249683 0.039215 6.367 1.97e-10 ***
## zipcode98022 -0.001084 0.036823 -0.029 0.976509
## zipcode98023 -0.079214 0.030206 -2.623 0.008735 **
## zipcode98024 0.499822 0.053794 9.291 < 2e-16 ***
## zipcode98027 0.465261 0.031644 14.703 < 2e-16 ***
## zipcode98028 0.346521 0.034712 9.983 < 2e-16 ***
## zipcode98029 0.569500 0.033739 16.880 < 2e-16 ***
## zipcode98030 0.013597 0.035706 0.381 0.703346
## zipcode98031 0.040355 0.035027 1.152 0.249288
## zipcode98032 0.005807 0.045397 0.128 0.898223
## zipcode98033 0.988031 0.031260 31.607 < 2e-16 ***
## zipcode98034 0.567286 0.029668 19.121 < 2e-16 ***
## zipcode98038 0.089311 0.029262 3.052 0.002275 **
## zipcode98039 3.606290 0.066500 54.230 < 2e-16 ***
## zipcode98040 1.372971 0.035335 38.856 < 2e-16 ***
## zipcode98042 0.005763 0.029634 0.194 0.845794
## zipcode98045 0.283421 0.037341 7.590 3.33e-14 ***
## zipcode98052 0.630367 0.029478 21.384 < 2e-16 ***
## zipcode98053 0.588771 0.031845 18.489 < 2e-16 ***
## zipcode98055 0.138771 0.035241 3.938 8.25e-05 ***
## zipcode98056 0.243115 0.031656 7.680 1.66e-14 ***
## zipcode98058 0.081937 0.030814 2.659 0.007840 **
## zipcode98059 0.217831 0.030694 7.097 1.32e-12 ***
## zipcode98065 0.230533 0.034011 6.778 1.25e-11 ***
## zipcode98070 0.003081 0.046971 0.066 0.947709
## zipcode98072 0.449663 0.035135 12.798 < 2e-16 ***
## zipcode98074 0.490007 0.031300 15.655 < 2e-16 ***
## zipcode98075 0.455420 0.032995 13.803 < 2e-16 ***
## zipcode98077 0.379010 0.038965 9.727 < 2e-16 ***
## zipcode98092 -0.096031 0.032791 -2.929 0.003408 **
## zipcode98102 1.413984 0.048699 29.035 < 2e-16 ***
## zipcode98103 0.907223 0.029250 31.017 < 2e-16 ***
## zipcode98105 1.306566 0.037107 35.211 < 2e-16 ***
## zipcode98106 0.379891 0.033298 11.409 < 2e-16 ***
## zipcode98107 0.922281 0.035489 25.988 < 2e-16 ***
## zipcode98108 0.350428 0.039536 8.863 < 2e-16 ***
## zipcode98109 1.391383 0.047933 29.028 < 2e-16 ***
## zipcode98112 1.715699 0.035528 48.291 < 2e-16 ***
## zipcode98115 0.906588 0.029426 30.809 < 2e-16 ***
## zipcode98116 0.803669 0.033508 23.984 < 2e-16 ***
## zipcode98117 0.877180 0.029738 29.497 < 2e-16 ***
## zipcode98118 0.463000 0.030211 15.325 < 2e-16 ***
## zipcode98119 1.304632 0.039851 32.738 < 2e-16 ***
## zipcode98122 0.939959 0.034654 27.124 < 2e-16 ***
## zipcode98125 0.550941 0.031603 17.433 < 2e-16 ***
## zipcode98126 0.530454 0.032900 16.123 < 2e-16 ***
## zipcode98133 0.438414 0.030334 14.453 < 2e-16 ***
## zipcode98136 0.676487 0.035633 18.985 < 2e-16 ***
## zipcode98144 0.755502 0.033110 22.818 < 2e-16 ***
## zipcode98146 0.302789 0.034637 8.742 < 2e-16 ***
## zipcode98148 0.231965 0.062308 3.723 0.000197 ***
## zipcode98155 0.406197 0.030993 13.106 < 2e-16 ***
## zipcode98166 0.191745 0.035942 5.335 9.66e-08 ***
## zipcode98168 0.217740 0.035375 6.155 7.63e-10 ***
## zipcode98177 0.632605 0.035980 17.582 < 2e-16 ***
## zipcode98178 0.123924 0.035616 3.479 0.000503 ***
## zipcode98188 0.095027 0.043989 2.160 0.030764 *
## zipcode98198 0.004311 0.034887 0.124 0.901649
## zipcode98199 1.080174 0.033948 31.818 < 2e-16 ***
## sqft_living 0.343773 0.007743 44.400 < 2e-16 ***
## waterfront 0.155790 0.003317 46.962 < 2e-16 ***
## view 0.123274 0.003582 34.410 < 2e-16 ***
## condition 0.066713 0.003204 20.822 < 2e-16 ***
## year2015 0.082389 0.006390 12.894 < 2e-16 ***
## yr_renovated 0.032288 0.003048 10.592 < 2e-16 ***
## sqft_above 0.109962 0.007161 15.355 < 2e-16 ***
## bedrooms0:bathrooms -0.096997 0.056883 -1.705 0.088172 .
## bedrooms1:bathrooms -0.190070 0.022267 -8.536 < 2e-16 ***
## bedrooms2:bathrooms -0.130688 0.008587 -15.219 < 2e-16 ***
## bedrooms3:bathrooms -0.073400 0.006145 -11.944 < 2e-16 ***
## bedrooms4:bathrooms 0.047023 0.007307 6.435 1.26e-10 ***
## bedrooms5:bathrooms 0.127393 0.009347 13.629 < 2e-16 ***
## bedrooms6:bathrooms 0.192241 0.015044 12.778 < 2e-16 ***
## bedrooms7:bathrooms -0.120702 0.027079 -4.457 8.34e-06 ***
## bedrooms8:bathrooms 0.152619 0.053974 2.828 0.004693 **
## bedrooms9:bathrooms -0.138864 0.050116 -2.771 0.005596 **
## bedrooms10:bathrooms -0.347320 0.103494 -3.356 0.000792 ***
## bedrooms11:bathrooms -0.199408 0.381040 -0.523 0.600753
## bedrooms33:bathrooms -0.471070 0.923604 -0.510 0.610032
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.437 on 21522 degrees of freedom
## Multiple R-squared: 0.8098, Adjusted R-squared: 0.809
## F-statistic: 1018 on 90 and 21522 DF, p-value: < 2.2e-16
library(MASS)
# Apply the standardisation to all numeric columns of df once more.
ind <- vapply(df, is.numeric, logical(1))
df[ind] <- lapply(df[ind], scale)
library(rpart)
## Warning: package 'rpart' was built under R version 3.4.3
library(rpart.plot)
# Regression tree (method = "anova") for price. The leading `.` in the formula
# already pulls in every other column of df, so the explicitly listed
# predictors add no further terms.
regTree <- rpart(
  price ~ . + sqft_living + zipcode + waterfront + view + yr_built +
    yr_renovated + bedrooms + bathrooms + sqft_above + condition + floors,
  data = df,
  method = "anova"
)
Regression tree
# Draw the fitted tree with uniform branch spacing, then label its nodes
# (use.n = TRUE adds the observation counts).
plot(regTree, uniform = TRUE, main = " Regression tree")
text(regTree, use.n = TRUE, cex = 0.6)
## Warning in labels.rpart(x, minlength = minlength): more than 52 levels in a
## predicting factor, truncated for printout
# Two panels side by side for the plots drawn by rsq.rpart().
par(mfrow = c(1, 2))
rsq.rpart(regTree)
##
## Regression tree:
## rpart(formula = price ~ . + sqft_living + zipcode + waterfront +
## view + yr_built + yr_renovated + bedrooms + bathrooms + sqft_above +
## condition + floors, data = df, method = "anova")
##
## Variables actually used in tree construction:
## [1] grade sqft_living zipcode
##
## Root node error: 21612/21613 = 0.99995
##
## n= 21613
##
## CP nsplit rel error xerror xstd
## 1 0.320270 0 1.00000 1.00016 0.041152
## 2 0.114782 1 0.67973 0.68012 0.032949
## 3 0.078560 2 0.56495 0.56908 0.024670
## 4 0.052578 3 0.48639 0.49051 0.024574
## 5 0.049445 4 0.43381 0.47198 0.024392
## 6 0.031927 5 0.38436 0.40071 0.019704
## 7 0.020928 6 0.35244 0.39072 0.017950
## 8 0.017918 7 0.33151 0.37025 0.017848
## 9 0.012805 8 0.31359 0.34132 0.013040
## 10 0.011242 9 0.30079 0.32594 0.012935
## 11 0.010000 10 0.28955 0.31565 0.012888
Compare the output between these two methods. Is there one that you would choose over the other?
Answer: From this, we see that the std error for the regression tree is 0.31, which is less than that of the linear regression model, which is 0.40. Thus, the regression tree model has the lower error and is preferred to the other.
# PART 2
# Load the Auto data set from the working directory.
Auto <- read.csv("Auto.csv")
Creating a binary variable, mpg01, that contains a 1 if mpg contains a value above its median, and a 0 if mpg contains a value below its median. You can compute the median using the median() function.
library(MASS)
library(ISLR)
## Warning: package 'ISLR' was built under R version 3.4.2
##
## Attaching package: 'ISLR'
## The following object is masked _by_ '.GlobalEnv':
##
## Auto
# mpg01 is 1 when mpg is strictly above the median mpg and 0 otherwise.
Auto$mpg01 <- as.numeric(Auto$mpg > median(Auto$mpg))
Exploring the data graphically in order to investigate the association between mpg01 and the other features.
# Pairwise correlations of every column except column 9.
cor(Auto[, -9])
## mpg cylinders displacement horsepower weight
## mpg 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442
## cylinders -0.7776175 1.0000000 0.9508233 0.8429834 0.8975273
## displacement -0.8051269 0.9508233 1.0000000 0.8972570 0.9329944
## horsepower -0.7784268 0.8429834 0.8972570 1.0000000 0.8645377
## weight -0.8322442 0.8975273 0.9329944 0.8645377 1.0000000
## acceleration 0.4233285 -0.5046834 -0.5438005 -0.6891955 -0.4168392
## year 0.5805410 -0.3456474 -0.3698552 -0.4163615 -0.3091199
## origin 0.5652088 -0.5689316 -0.6145351 -0.4551715 -0.5850054
## mpg01 0.8369392 -0.7591939 -0.7534766 -0.6670526 -0.7577566
## acceleration year origin mpg01
## mpg 0.4233285 0.5805410 0.5652088 0.8369392
## cylinders -0.5046834 -0.3456474 -0.5689316 -0.7591939
## displacement -0.5438005 -0.3698552 -0.6145351 -0.7534766
## horsepower -0.6891955 -0.4163615 -0.4551715 -0.6670526
## weight -0.4168392 -0.3091199 -0.5850054 -0.7577566
## acceleration 1.0000000 0.2903161 0.2127458 0.3468215
## year 0.2903161 1.0000000 0.1815277 0.4299042
## origin 0.2127458 0.1815277 1.0000000 0.5136984
## mpg01 0.3468215 0.4299042 0.5136984 1.0000000
# Scatterplot matrix of every column except column 9
pairs(Auto[, -9])
in scatter plot horsepower and weight have high significance with year
# Boxplots of each candidate predictor split by mpg01, on a 2 x 3 grid
par(mfrow = c(2, 3))
boxplot(cylinders ~ mpg01,
  data = Auto, main = "Cylinders vs mpg01"
)
boxplot(displacement ~ mpg01,
  data = Auto, main = "Displacement vs mpg01"
)
boxplot(horsepower ~ mpg01,
  data = Auto, main = "Horsepower vs mpg01"
)
boxplot(weight ~ mpg01,
  data = Auto, main = "Weight vs mpg01"
)
boxplot(acceleration ~ mpg01,
  data = Auto, main = "Acceleration vs mpg01"
)
boxplot(year ~ mpg01,
  data = Auto, main = "Year vs mpg01"
)
There is some association between “mpg01” and “cylinders”, “weight”, “displacement”, “horsepower” and “acceleration”; these look the most promising.
# Hold-out split: 80% of the rows go to the training set, the rest to the
# test set
set.seed(1)
rows <- sample(x = nrow(Auto), size = 0.80 * nrow(Auto))
trainset <- Auto[rows, ]
testset <- Auto[-rows, ]
# LDA
library(MASS)
lda.fit <- lda(
  mpg01 ~ (displacement + acceleration + horsepower:year + weight + cylinders),
  data = trainset
)
lda.pred <- predict(lda.fit, testset)
# Confusion matrix: rows are the true mpg01, columns the predicted class.
table(testset$mpg01, lda.pred$class)
##
## 0 1
## 0 32 3
## 1 0 44
# Share of misclassified test rows, in percent.
round(sum(lda.pred$class != testset$mpg01) / nrow(testset) * 100, 2)
## [1] 3.8
Test error is 3.8%
# Logistic regression
lr.fit <- glm(
  as.factor(mpg01) ~
    (displacement + acceleration + horsepower:year + weight + cylinders),
  data = trainset,
  family = "binomial"
)
# Predicted probabilities on the test set, cut at 0.5 to get class labels.
lr.probs <- predict(lr.fit, testset, type = "response")
lr.pred <- ifelse(lr.probs > 0.5, "1", "0")
table(testset$mpg01, lr.pred)
## lr.pred
## 0 1
## 0 32 3
## 1 2 42
# Share of misclassified test rows, in percent.
round(sum(lr.pred != testset$mpg01) / nrow(testset) * 100, 2)
## [1] 6.33
Test error is 6.33%
Performing KNN on the training data
# Standardise the predictors for KNN; columns 9 and 10 of Auto are dropped
# before scaling.
data <- scale(Auto[, -c(9, 10)])
set.seed(1234)
# 70/30 train/test split. The training size is derived from the actual row
# count rather than a hard-coded 392, floor() makes the truncation of the
# fractional size explicit, and `replace` is spelled out instead of relying on
# partial argument matching.
train <- sample(seq_len(nrow(Auto)), floor(nrow(Auto) * 0.7), replace = FALSE)
test <- -train
knn_features <- c(
  "cylinders", "weight", "displacement", "horsepower", "year", "acceleration"
)
training_data <- data[train, knn_features]
testing_data <- data[test, knn_features]
## knn() takes the training response separately from the predictors
train.mpg01 <- Auto$mpg01[train]
## the test response is kept separately for assessing the model later on
test.mpg01 <- Auto$mpg01[test]
library(class)
# Seed again so that any random tie-breaking inside knn() is reproducible.
set.seed(1234)
knn_pred_y <- knn(training_data, testing_data, train.mpg01, k = 1)
table(knn_pred_y, test.mpg01)
## test.mpg01
## knn_pred_y 0 1
## 0 51 4
## 1 5 58
# Test misclassification rate for k = 1.
mean(knn_pred_y != test.mpg01)
## [1] 0.07627119
# Try every k from 1 up to the number of test rows and record the test error
knn_pred_y <- NULL
error_rate <- numeric(nrow(testing_data))
for (k in seq_len(nrow(testing_data))) {
  # Same seed for every k so the runs differ only in k.
  set.seed(1234)
  knn_pred_y <- knn(training_data, testing_data, train.mpg01, k = k)
  error_rate[k] <- mean(test.mpg01 != knn_pred_y)
}
### smallest error rate, as a percentage rounded to two decimals
min_error_rate <- min(round(error_rate * 100, 2))
print(min_error_rate)
## [1] 5.93
The minimum error rate is 5.93%
### get the index of the smallest error rate, which is the best k.
### The comparison uses the raw error rates: min_error_rate is a rounded
### percentage (e.g. 5.93) and never equals a raw fraction such as 0.0593,
### so matching against it selects nothing (integer(0)).
K <- which(error_rate == min(error_rate))
print(K)
## integer(0)
# When we train a KNN model with k=3, then we get the lowest misclassification error rate of 5.93%.
library(ggplot2)
# Test error rate as a function of k, drawn as points joined by a line.
qplot(
  seq_len(nrow(testing_data)), error_rate,
  xlab = "K",
  ylab = "Error Rate",
  geom = c("point", "line")
)
Which value of K seems to perform the best on this data set? Answer:3 ——————————————————————————————————————————————————————————————————
PART 3
DBSCAN
# The clustering CSV files hold raw coordinates without a header row. With the
# default header = TRUE the first observation is consumed as column names
# (e.g. "X5.867498067335642276e.00") and only 1499 points get clustered, so
# the files are read with header = FALSE.
# NOTE(review): the header-less layout is evident for clustering1.csv; the
# other three files show the same 1499-point count and are assumed to share
# it -- confirm against the files.
data <- read.csv("clustering1.csv", header = FALSE)
df2 <- read.csv("clustering2.csv", header = FALSE)
df3 <- read.csv("clustering3.csv", header = FALSE)
df4 <- read.csv("clustering4.csv", header = FALSE)
library(dbscan)
library("fpc")
## Warning: package 'fpc' was built under R version 3.4.3
##
## Attaching package: 'fpc'
## The following object is masked from 'package:dbscan':
##
## dbscan
set.seed(1234)
# fpc::dbscan is called explicitly because both loaded packages export a
# function named dbscan. eps and MinPts are chosen separately per data set.
db1 <- fpc::dbscan(data, eps = 2, MinPts = 80)
db2 <- fpc::dbscan(df2, eps = 0.5, MinPts = 10)
db3 <- fpc::dbscan(df3, eps = 0.15, MinPts = 5)
db4 <- fpc::dbscan(df4, eps = 0.18, MinPts = 10)
# Print each DBSCAN fit in turn (db1 to db4). The `##` lines after each call
# are the output recorded when the notebook was rendered.
print(db1)
## dbscan Pts=1499 MinPts=80 eps=2
## 1 2 3
## border 8 11 7
## seed 491 489 493
## total 499 500 500
print(db2)
## dbscan Pts=1499 MinPts=10 eps=0.5
## 1
## seed 1499
## total 1499
print(db3)
## dbscan Pts=1499 MinPts=5 eps=0.15
## 1 2
## border 1 0
## seed 748 750
## total 749 750
print(db4)
## dbscan Pts=1499 MinPts=10 eps=0.18
## 1 2
## seed 750 749
## total 750 749
library("factoextra")
## Warning: package 'factoextra' was built under R version 3.4.1
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
# One cluster plot per DBSCAN fit: raw (unstandardised) coordinates, points
# only, no ellipses or cluster centres.
dp1 <- fviz_cluster(
  db1,
  data = data, stand = FALSE, ellipse = FALSE, show.clust.cent = FALSE,
  geom = "point", palette = "jco", ggtheme = theme_classic()
)
dp2 <- fviz_cluster(
  db2,
  data = df2, stand = FALSE, ellipse = FALSE, show.clust.cent = FALSE,
  geom = "point", palette = "jco", ggtheme = theme_classic()
)
dp3 <- fviz_cluster(
  db3,
  data = df3, stand = FALSE, ellipse = FALSE, show.clust.cent = FALSE,
  geom = "point", palette = "jco", ggtheme = theme_classic()
)
dp4 <- fviz_cluster(
  db4,
  data = df4, stand = FALSE, ellipse = FALSE, show.clust.cent = FALSE,
  geom = "point", palette = "jco", ggtheme = theme_classic()
)
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 3.4.1
# Arrange the four plots on a 2 x 2 grid.
grid.arrange(dp1, dp2, dp3, dp4, nrow = 2)
| Hierarchical clustering |
# Drop incomplete rows, standardise the columns, and preview the first rows.
data <- na.omit(data)
data <- scale(data)
head(data)
## X5.867498067335642276e.00 X8.177151880030342213e.00 ## 1 0.3971268 1.24070227 ## 2 0.6611644 1.30575784 ## 3 0.5854193 0.06447004 ## 4 0.7907919 0.18275159 ## 5 0.8539870 -0.05893430 ## 6 0.5443075 1.22563007 |
# Complete-linkage hierarchical clustering on Euclidean distances (data set 1).
d <- dist(data, method = "euclidean")
hc1 <- hclust(d, method = "complete")
plot(hc1, cex = 0.6, hang = -1)
# Cut the dendrogram into three clusters and tabulate their sizes.
segtree <- cutree(hc1, k = 3)
table(segtree)
## segtree ## 1 2 3 ## 499 500 500 |
plot(hc1, cex = 0.6)
# border takes one colour per rectangle; a single value (such as 2.5) would
# draw all three rectangles in the same colour.
rect.hclust(hc1, k = 3, border = 2:4)
fviz_cluster(list(data = data, cluster = segtree))
# Complete-linkage hierarchical clustering on Euclidean distances (data set 2).
d1 <- dist(df2, method = "euclidean")
hc2 <- hclust(d1, method = "complete")
plot(hc2, cex = 0.6, hang = -1)
# Cut the dendrogram into three clusters and tabulate their sizes.
segtree1 <- cutree(hc2, k = 3)
table(segtree1)
## segtree1 ## 1 2 3 ## 459 617 423 |
plot(hc2, cex = 0.6)
# border takes one colour per rectangle; a single value (such as 2.5) would
# draw all three rectangles in the same colour.
rect.hclust(hc2, k = 3, border = 2:4)
fviz_cluster(list(data = df2, cluster = segtree1))
# Complete-linkage hierarchical clustering on Euclidean distances (data set 3).
d3 <- dist(df3, method = "euclidean")
hc3 <- hclust(d3, method = "complete")
plot(hc3, cex = 0.6, hang = -1)
# Cut the dendrogram into three clusters and tabulate their sizes.
segtree3 <- cutree(hc3, k = 3)
table(segtree3)
## segtree3 ## 1 2 3 ## 574 569 356 |
plot(hc3, cex = 0.6)
# border takes one colour per rectangle; a single value (such as 2.5) would
# draw all three rectangles in the same colour.
rect.hclust(hc3, k = 3, border = 2:4)
fviz_cluster(list(data = df3, cluster = segtree3))
# Complete-linkage hierarchical clustering on Euclidean distances (data set 4).
d4 <- dist(df4, method = "euclidean")
hc4 <- hclust(d4, method = "complete")
plot(hc4, cex = 0.6, hang = -1)
# Cut the dendrogram into three clusters and tabulate their sizes.
segtree4 <- cutree(hc4, k = 3)
table(segtree4)
## segtree4 ## 1 2 3 ## 637 576 286 |
plot(hc4, cex = 0.6)
# border takes one colour per rectangle; a single value (such as 2.5) would
# draw all three rectangles in the same colour.
rect.hclust(hc4, k = 3, border = 2:4)
fviz_cluster(list(data = df4, cluster = segtree4))
K-MEANS
# k-means on each data set, keeping the best of 25 random starts; the number
# of centres is set separately per data set.
k1 <- kmeans(data, centers = 4, nstart = 25)
k2 <- kmeans(df2, centers = 8, nstart = 25)
k3 <- kmeans(df3, centers = 15, nstart = 25)
k4 <- kmeans(df4, centers = 20, nstart = 25)
# One titled cluster plot per fit.
pt1 <- fviz_cluster(k1, geom = "point", data = data) +
  ggtitle("Kmeans - Clustering 1 dataset")
pt2 <- fviz_cluster(k2, geom = "point", data = df2) +
  ggtitle("Kmeans - Clustering 2 dataset")
pt3 <- fviz_cluster(k3, geom = "point", data = df3) +
  ggtitle("Kmeans - Clustering 3 dataset")
pt4 <- fviz_cluster(k4, geom = "point", data = df4) +
  ggtitle("Kmeans - Clustering 4 dataset")
library(gridExtra)
# Arrange the four plots on a 2 x 2 grid.
grid.arrange(pt1, pt2, pt3, pt4, nrow = 2)
PART 4
1. Suppose we have a dataset with five predictors, X1 = GPA, X2 = IQ, X3 = Gender (1 for Female, 0 for Male), X4 = Interaction between GPA and IQ, and X5 = Interaction between GPA and Gender. The response is the starting salary after graduation (in thousands of dollars). Suppose we use least squares to fit the model, and get $\hat\beta_0 = 50$, $\hat\beta_1 = 20$, $\hat\beta_2 = 0.07$, $\hat\beta_3 = 35$, $\hat\beta_4 = 0.01$, and $\hat\beta_5 = -10$. A) Which answer is correct? (i) For a fixed value of IQ and GPA, males earn more on average than females. (ii) For a fixed value of IQ and GPA, females earn more on average than males. (iii) For a fixed value of IQ and GPA, males earn more on average than females provided that the GPA is high enough. (iv) For a fixed value of IQ and GPA, females earn more on average than males provided that the GPA is high enough.
Answer: The correct answer is (iii), i.e. for a fixed value of IQ and GPA, males earn more on average than females provided that the GPA is high enough. Since males are coded 0 and females 1, male is the baseline. The positive sign of $\hat\beta_3$ shows that, on average, women earn more than men when GPA is zero. However, as GPA increases, average wages become relatively higher for men ($\hat\beta_5 < 0$): the female-minus-male difference is $\hat\beta_3 + \hat\beta_5\,\mathrm{GPA}$, which becomes negative once GPA is large enough. Therefore, if GPA is high enough, men will earn more than women on average; if the GPA is low, statement (ii) holds instead.
B)Predict the salary of a female with IQ of 110 and a GPA of 4.0.
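Answer: The fitted model is $\hat{Y} = \hat\beta_0 + \hat\beta_1\,\mathrm{GPA} + \hat\beta_2\,\mathrm{IQ} + \hat\beta_3\,\mathrm{Gender} + \hat\beta_4\,(\mathrm{GPA} \times \mathrm{IQ}) + \hat\beta_5\,(\mathrm{GPA} \times \mathrm{Gender})$,
which for a female (Gender = 1) is equal to $85 + 10\,\mathrm{GPA} + 0.07\,\mathrm{IQ} + 0.01\,\mathrm{GPA} \times \mathrm{IQ}$.
$\hat{Y} = 85 + 10 \times 4 + 0.07 \times 110 + 0.01 \times 4 \times 110 = 85 + 40 + 7.7 + 4.4 = 137.1$, which is \$137,100 for the female’s starting salary!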
Suppose we collect data for a group of students in a statistics class with variables X1=hours studied, X2=undergrad GPA, and Y=receive an A. We fit a logistic regression and produce estimated coefficient, β̂ 0=−6, β̂ 1=0.05, β̂ 2=1. Estimate the probability that a student who studies for 40 h and has an undergrad GPA of 3.5 gets an A in the class.
Answer: Given:
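$\beta_0 = -6$, $\beta_1 = 0.05$, $\beta_2 = 1$, $X_1 = 40$ and $X_2 = 3.5$. Probability $= \dfrac{e^{\beta_0 + \beta_1 \cdot 40 + \beta_2 \cdot 3.5}}{1 + e^{\beta_0 + \beta_1 \cdot 40 + \beta_2 \cdot 3.5}} = \dfrac{e^{-0.5}}{1 + e^{-0.5}} \approx 0.3775$, i.e. Probability = 37.75%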
How many hours would the student in part (a) need to study to have a 50 % chance of getting an A in the class?
Answer: To increase the chance of an A without altering the GPA, the student has to increase the number of hours studied, so we can test a sequence of hours and see how the chances change. Equivalently, a 50% chance corresponds to log-odds of zero, so we solve $-6 + 0.05\,X_1 + 1 \times 3.5 = 0$, which gives $X_1 = 50$. Therefore, to have a 50% chance, the student needs to study at least 50 hours. ——————————————————————————————————————————————————————————————————
# Six observations in two dimensions, entered row by row as (x1, x2) pairs.
n <- 6
x <- matrix(
  c(
    1, 4,
    1, 3,
    0, 4,
    5, 1,
    6, 2,
    4, 0
  ),
  nrow = n, byrow = TRUE
)
plot(x)
Randomly assign a cluster label to each observation. Report the cluster labels for each observation.
# Random initial assignment of every observation to cluster 1 or 2.
set.seed(1)
labels <- sample(2, nrow(x), replace = TRUE)
labels
## [1] 1 1 2 2 1 2
# Colour the points by label (label + 1 picks the colour).
plot(x, col = labels + 1, pch = 20, cex = 2)
Compute the centroid for each cluster.
# Centroid of a cluster = column-wise mean of the observations carrying its
# label.
centroid1 <- apply(x[labels == 1, , drop = FALSE], 2, mean)
centroid2 <- apply(x[labels == 2, , drop = FALSE], 2, mean)
# Points coloured by cluster, with each centroid marked by a cross in the
# matching colour.
plot(x[, 1], x[, 2], col = labels + 1, pch = 20, cex = 2)
points(centroid1[1], centroid1[2], col = 2, pch = 4)
points(centroid2[1], centroid2[2], col = 3, pch = 4)
Assign each observation to the centroid to which it is closest, in terms of Euclidean distance. Report the cluster labels for each observation.
# Reassign every observation to its nearest centroid (Euclidean distance).
# The labels are computed from the current centroids instead of being typed in
# by hand, so they stay correct if the random initial labels change.
labels <- local({
  # Distance from every row of x to one centroid.
  dist_to <- function(centre) sqrt(rowSums(sweep(x, 2, centre)^2))
  # Ties go to cluster 1.
  ifelse(dist_to(centroid1) <= dist_to(centroid2), 1, 2)
})
plot(x[, 1], x[, 2], col = (labels + 1), pch = 20, cex = 2)
points(centroid1[1], centroid1[2], col = 2, pch = 4)
points(centroid2[1], centroid2[2], col = 3, pch = 4)
Repeat (c) and (d) until the answers obtained stop changing.
# Recompute both centroids from the updated labels (column-wise means).
centroid1 <- apply(x[labels == 1, , drop = FALSE], 2, mean)
centroid2 <- apply(x[labels == 2, , drop = FALSE], 2, mean)
# Points coloured by cluster, with each centroid marked by a cross in the
# matching colour.
plot(x[, 1], x[, 2], col = labels + 1, pch = 20, cex = 2)
points(centroid1[1], centroid1[2], col = 2, pch = 4)
points(centroid2[1], centroid2[2], col = 3, pch = 4)
In your plot from (a), color the observations according to the clusters labels obtained.
# Observations coloured by their current cluster label.
plot(x[, 1], x[, 2], col = labels + 1, pch = 20, cex = 2)